From 712d720fa58ced8a0922aa107967f64d453ef857 Mon Sep 17 00:00:00 2001
From: "deepin-community-bot[bot]"
<156989552+deepin-community-bot[bot]@users.noreply.github.com>
Date: Tue, 30 Jun 2026 06:03:49 +0000
Subject: [PATCH] feat: update python-charset-normalizer to 3.4.2-1
---
.codecov.yml | 8 -
.coveragerc | 28 ++-
.pre-commit-config.yaml | 30 +++
.readthedocs.yaml | 4 +-
CHANGELOG.md | 47 +++-
CONTRIBUTING.md | 14 +-
LICENSE | 4 +-
MANIFEST.in | 2 +-
README.md | 21 +-
UPGRADE.md | 31 ---
bin/bc.py | 73 +++---
bin/coverage.py | 80 +++---
bin/integration.py | 54 ----
bin/performance.py | 20 +-
bin/run_autofix.sh | 12 -
bin/run_checks.sh | 15 --
bin/serve.py | 37 ---
build-requirements.txt | 6 -
charset_normalizer/version.py | 6 -
debian/.gitignore | 1 +
debian/changelog | 50 ++++
debian/control | 11 +-
debian/copyright | 2 +-
...ucible-assure-stable-order-use-a-tup.patch | 23 ++
...ible-privacy-issue-to-a-linked-image.patch | 2 +-
debian/patches/mypyc-debug-symbols | 22 ++
debian/patches/series | 2 +
debian/rules | 14 +-
debian/tests/upstream-tests | 2 +-
debian/upstream/metadata | 9 +-
debian/watch | 2 +-
dev-requirements.txt | 16 +-
docs/community/featured.rst | 5 +-
docs/index.rst | 2 +-
noxfile.py | 234 ++++++++++++++++++
pyproject.toml | 85 +++++++
setup.cfg | 70 ------
setup.py | 28 +--
.../charset_normalizer}/__init__.py | 4 +-
.../charset_normalizer}/__main__.py | 2 +
.../charset_normalizer}/api.py | 136 ++++++----
.../charset_normalizer}/cd.py | 68 ++---
.../charset_normalizer}/cli/__init__.py | 2 +
.../charset_normalizer}/cli/__main__.py | 111 ++++++++-
.../charset_normalizer}/constant.py | 46 +++-
.../charset_normalizer}/legacy.py | 18 +-
.../charset_normalizer}/md.py | 98 +++++---
.../charset_normalizer}/models.py | 106 ++++----
.../charset_normalizer}/py.typed | 0
.../charset_normalizer}/utils.py | 81 +++---
src/charset_normalizer/version.py | 8 +
tests/__init__.py | 1 -
tests/test_base_detection.py | 115 ++++++---
tests/test_cli.py | 122 +++------
tests/test_coherence_detection.py | 76 +++++-
tests/test_detect_legacy.py | 76 ++----
tests/test_edge_case.py | 57 ++++-
tests/test_full_detection.py | 65 ++---
tests/test_isbinary.py | 27 +-
tests/test_large_payload.py | 36 ++-
tests/test_logging.py | 25 +-
tests/test_mess_detection.py | 51 ++--
tests/test_preemptive_detection.py | 86 ++++++-
tests/test_utils.py | 6 +-
64 files changed, 1587 insertions(+), 908 deletions(-)
delete mode 100644 .codecov.yml
create mode 100644 .pre-commit-config.yaml
delete mode 100644 UPGRADE.md
delete mode 100644 bin/integration.py
delete mode 100755 bin/run_autofix.sh
delete mode 100755 bin/run_checks.sh
delete mode 100644 bin/serve.py
delete mode 100644 build-requirements.txt
delete mode 100644 charset_normalizer/version.py
create mode 100644 debian/.gitignore
create mode 100644 debian/patches/Make-the-build-reproducible-assure-stable-order-use-a-tup.patch
create mode 100644 debian/patches/mypyc-debug-symbols
create mode 100644 noxfile.py
create mode 100644 pyproject.toml
delete mode 100644 setup.cfg
rename {charset_normalizer => src/charset_normalizer}/__init__.py (97%)
rename {charset_normalizer => src/charset_normalizer}/__main__.py (66%)
rename {charset_normalizer => src/charset_normalizer}/api.py (84%)
rename {charset_normalizer => src/charset_normalizer}/cd.py (86%)
rename {charset_normalizer => src/charset_normalizer}/cli/__init__.py (73%)
rename {charset_normalizer => src/charset_normalizer}/cli/__main__.py (71%)
rename {charset_normalizer => src/charset_normalizer}/constant.py (93%)
rename {charset_normalizer => src/charset_normalizer}/legacy.py (81%)
rename {charset_normalizer => src/charset_normalizer}/md.py (87%)
rename {charset_normalizer => src/charset_normalizer}/models.py (77%)
rename {charset_normalizer => src/charset_normalizer}/py.typed (100%)
rename {charset_normalizer => src/charset_normalizer}/utils.py (84%)
create mode 100644 src/charset_normalizer/version.py
diff --git a/.codecov.yml b/.codecov.yml
deleted file mode 100644
index f307895..0000000
--- a/.codecov.yml
+++ /dev/null
@@ -1,8 +0,0 @@
-coverage:
- status:
- project:
- default:
- target: 88%
- threshold: null
- patch: false
- changes: false
diff --git a/.coveragerc b/.coveragerc
index 723bfd0..8b59131 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,2 +1,28 @@
[run]
-source=charset_normalizer
+source =
+ charset_normalizer
+# Needed for Python 3.11 and lower
+disable_warnings = no-sysmon
+
+[paths]
+source =
+ src/charset_normalizer
+ */charset_normalizer
+ *\charset_normalizer
+
+[report]
+omit =
+ src/charset_normalizer/__main__.py
+
+exclude_lines =
+ except ModuleNotFoundError:
+ except ImportError:
+ pass
+ import
+ raise NotImplementedError
+ .* # Platform-specific.*
+ .*:.* # Python \d.*
+ .* # Abstract
+ .* # Defensive:
+ if (?:typing.)?TYPE_CHECKING:
+ ^\s*?\.\.\.\s*$
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..93b904c
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,30 @@
+exclude: 'docs/|data/|tests/'
+
+repos:
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v5.0.0
+ hooks:
+ - id: check-yaml
+ - id: debug-statements
+ - id: end-of-file-fixer
+ - id: trailing-whitespace
+ - repo: https://github.com/asottile/pyupgrade
+ rev: v3.19.1
+ hooks:
+ - id: pyupgrade
+ args: [ --py37-plus, --keep-runtime-typing ]
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ # Ruff version.
+ rev: v0.9.1
+ hooks:
+ # Run the linter.
+ - id: ruff
+ args: [ --fix ]
+ # Run the formatter.
+ - id: ruff-format
+ - repo: https://github.com/pre-commit/mirrors-mypy
+ rev: v1.14.1
+ hooks:
+ - id: mypy
+ args: [ --check-untyped-defs ]
+ exclude: 'tests/|noxfile.py|setup.py|bin/'
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 87d8ed3..b783a32 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -1,9 +1,9 @@
version: 2
build:
- os: ubuntu-20.04
+ os: ubuntu-22.04
tools:
- python: "3.9"
+ python: "3.10"
# Build documentation in the docs/ directory with Sphinx
sphinx:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index de66da4..9e78f38 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,47 @@
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
+## [3.4.2](https://github.com/Ousret/charset_normalizer/compare/3.4.1...3.4.2) (2025-05-02)
+
+### Fixed
+- Addressed the DeprecationWarning in our CLI regarding `argparse.FileType` by backporting the target class into the package. (#591)
+- Improved the overall reliability of the detector with CJK Ideographs. (#605) (#587)
+
+### Changed
+- Optional mypyc compilation upgraded to version 1.15 for Python >= 3.8
+
+## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
+
+### Changed
+- Project metadata is now stored in `pyproject.toml` instead of `setup.cfg`, with setuptools as the build backend.
+- Enforce delayed annotation loading for simpler and more consistent types in the project.
+- Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
+
+### Added
+- pre-commit configuration.
+- noxfile.
+
+### Removed
+- `build-requirements.txt` as per using `pyproject.toml` native build configuration.
+- `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
+- `setup.cfg` in favor of `pyproject.toml` metadata configuration.
+- Unused `utils.range_scan` function.
+
+### Fixed
+- Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
+- Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
+
+## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
+
+### Added
+- Argument `--no-preemptive` in the CLI to prevent the detector from searching for hints.
+- Support for Python 3.13 (#512)
+
+### Fixed
+- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
+- Improved the general reliability of the detector based on user feedback. (#520) (#509) (#498) (#407) (#537)
+- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
+
## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
### Fixed
@@ -170,7 +211,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
### Fixed
-- ASCII miss-detection on rare cases (PR #170)
+- ASCII miss-detection on rare cases (PR #170)
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
@@ -202,7 +243,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
-- Code style as refactored by Sourcery-AI (PR #131)
+- Code style as refactored by Sourcery-AI (PR #131)
- Minor adjustment on the MD around european words (PR #133)
- Remove and replace SRTs from assets / tests (PR #139)
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
@@ -275,7 +316,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
### Fixed
-- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
+- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
### Changed
- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index abee674..40b19f0 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,12 +1,12 @@
# Contribution Guidelines
-If you’re reading this, you’re probably interested in contributing to Charset Normalizer.
-Thank you very much! Open source projects live-and-die based on the support they receive from others,
+If you’re reading this, you’re probably interested in contributing to Charset Normalizer.
+Thank you very much! Open source projects live-and-die based on the support they receive from others,
and the fact that you’re even considering contributing to this project is very generous of you.
## Questions
-The GitHub issue tracker is for *bug reports* and *feature requests*.
+The GitHub issue tracker is for *bug reports* and *feature requests*.
Questions are allowed only when no answer are provided in docs.
## Good Bug Reports
@@ -67,6 +67,10 @@ the backward-compatibility.
## How to run tests locally?
It is essential that you run, prior to any submissions the mandatory checks.
-Run the script `./bin/run_checks.sh` to verify that your modification are not breaking anything.
-Also, make sure to run the `./bin/run_autofix.sh` to comply with the style format and import sorting.
+```shell
+pip install nox
+nox -s test
+nox -s lint
+nox -s coverage
+```
diff --git a/LICENSE b/LICENSE
index ad82355..9725772 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) 2019 TAHRI Ahmed R.
+Copyright (c) 2025 TAHRI Ahmed R.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
\ No newline at end of file
+SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
index 3792f5b..8da2cd0 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,4 @@
-include LICENSE README.md CHANGELOG.md charset_normalizer/py.typed dev-requirements.txt
+include LICENSE README.md CHANGELOG.md src/charset_normalizer/py.typed dev-requirements.txt SECURITY.md noxfile.py
recursive-include data *.md
recursive-include data *.txt
recursive-include docs *
diff --git a/README.md b/README.md
index 13e6e14..ee5b2e7 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@
Featured Packages
-
+
@@ -55,8 +55,7 @@ This project offers you an alternative to **Universal Charset Encoding Detector*

-*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*
-Did you got there because of the logs? See [https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html](https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html)
+*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*
## ⚡ Performance
@@ -64,21 +63,23 @@ This package offer better performance than its counterpart Chardet. Here are som
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
-| [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec |
+| [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
| Package | 99th percentile | 95th percentile | 50th percentile |
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
-| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
+| [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
| charset-normalizer | 100 ms | 50 ms | 5 ms |
+_updated as of December 2024 using CPython 3.12_
+
Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
> And yes, these results might change at any time. The dataset can be updated to include more files.
> The actual delays heavily depends on your CPU capabilities. The factors should remain the same.
> Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
-> (eg. Supported Encoding) Challenge-them if you want.
+> (e.g. Supported Encoding) Challenge-them if you want.
## ✨ Installation
@@ -195,11 +196,11 @@ reliable alternative using a completely different method. Also! I never back dow
I **don't care** about the **originating charset** encoding, because **two different tables** can
produce **two identical rendered string.**
-What I want is to get readable text, the best I can.
+What I want is to get readable text, the best I can.
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
-Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is to repair unicode string whereas charset-normalizer to convert raw file in unknown encoding to unicode.
+Don't confuse the package **ftfy** with charset-normalizer or chardet. ftfy's goal is to repair Unicode strings, whereas charset-normalizer's goal is to convert raw files in an unknown encoding to Unicode.
## 🍰 How
@@ -211,7 +212,7 @@ Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is
**Wait a minute**, what is noise/mess and coherence according to **YOU ?**
*Noise :* I opened hundred of text files, **written by humans**, with the wrong encoding table. **I observed**, then
-**I established** some ground rules about **what is obvious** when **it seems like** a mess.
+**I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text).
I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
improve or rewrite it.
@@ -255,3 +256,5 @@ from the experts who know it best, while seamlessly integrating with existing
tools.
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
+
+[](https://www.bestpractices.dev/projects/7297)
diff --git a/UPGRADE.md b/UPGRADE.md
deleted file mode 100644
index 4b8f7bb..0000000
--- a/UPGRADE.md
+++ /dev/null
@@ -1,31 +0,0 @@
-Guide to upgrade your code from v1 to v2
-----------------------------------------
-
- * If you are using the legacy `detect` function, that is it. You have nothing to do.
-
-## Detection
-
-### Before
-
-```python
-from charset_normalizer import CharsetNormalizerMatches
-
-results = CharsetNormalizerMatches.from_bytes(
- '我没有埋怨,磋砣的只是一些时间。'.encode('utf_32')
-)
-```
-
-### After
-
-```python
-from charset_normalizer import from_bytes
-
-results = from_bytes(
- '我没有埋怨,磋砣的只是一些时间。'.encode('utf_32')
-)
-```
-
-Methods that once were staticmethods of the class `CharsetNormalizerMatches` are now basic functions.
-`from_fp`, `from_bytes`, `from_fp` and `` are concerned.
-
-Staticmethods scheduled to be removed in version 3.0
diff --git a/bin/bc.py b/bin/bc.py
index df28943..ba4ee41 100644
--- a/bin/bc.py
+++ b/bin/bc.py
@@ -1,13 +1,13 @@
-#!/bin/python
+from __future__ import annotations
+
+import argparse
from glob import glob
from os.path import isdir
from sys import argv
-from typing import List
-import argparse
-from charset_normalizer import detect as tbt_detect
from chardet import detect as chardet_detect
+from charset_normalizer import detect as tbt_detect
from charset_normalizer.utils import iana_name
@@ -16,28 +16,35 @@ def calc_equivalence(content: bytes, cp_a: str, cp_b: str):
str_a = content.decode(cp_a)
str_b = content.decode(cp_b)
except UnicodeDecodeError:
- return 0.
+ return 0.0
character_count = len(str_a)
- diff_character_count = sum(
- chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)
- )
+ diff_character_count = sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b))
- return 1. - (diff_character_count / character_count)
+ return 1.0 - (diff_character_count / character_count)
-def cli_bc(arguments: List[str]):
+def cli_bc(arguments: list[str]):
parser = argparse.ArgumentParser(
description="BC script checker for Charset-Normalizer with Chardet"
)
- parser.add_argument('-c', '--coverage', action="store", default=85, type=int, dest='coverage',
- help="Define the minimum acceptable coverage to succeed")
+ parser.add_argument(
+ "-c",
+ "--coverage",
+ action="store",
+ default=85,
+ type=int,
+ dest="coverage",
+ help="Define the minimum acceptable coverage to succeed",
+ )
args = parser.parse_args(arguments)
if not isdir("./char-dataset"):
- print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory")
+ print(
+ "This script require https://github.com/Ousret/char-dataset to be cloned on package root directory"
+ )
exit(1)
success_count = 0
@@ -50,44 +57,52 @@ def cli_bc(arguments: List[str]):
content = fp.read()
chardet_result = chardet_detect(content)
- chardet_encoding = chardet_result['encoding']
+ chardet_encoding = chardet_result["encoding"]
charset_normalizer_result = tbt_detect(content)
- charset_normalizer_encoding = charset_normalizer_result['encoding']
+ charset_normalizer_encoding = charset_normalizer_result["encoding"]
if [chardet_encoding, charset_normalizer_encoding].count(None) == 1:
- print("⚡⚡ '{}' (BC-Break) New('{}') vs Legacy('{}')".format(tbt_path, charset_normalizer_encoding, chardet_encoding))
+ print(
+ f"⚡⚡ '{tbt_path}' (BC-Break) New('{charset_normalizer_encoding}') vs Legacy('{chardet_encoding}')"
+ )
continue
if charset_normalizer_encoding == chardet_encoding:
success_count += 1
- print("✅✅ '{}' (BC)".format(tbt_path))
+ print(f"✅✅ '{tbt_path}' (BC)")
continue
- if (chardet_encoding is None and charset_normalizer_encoding is None) or (iana_name(chardet_encoding, False) == iana_name(charset_normalizer_encoding, False)):
+ if (chardet_encoding is None and charset_normalizer_encoding is None) or (
+ iana_name(chardet_encoding, False)
+ == iana_name(charset_normalizer_encoding, False)
+ ):
success_count += 1
- print("✅✅ '{}' (BC)".format(tbt_path))
+ print(f"✅✅ '{tbt_path}' (BC)")
continue
- calc_eq = calc_equivalence(content, chardet_encoding, charset_normalizer_encoding)
+ calc_eq = calc_equivalence(
+ content, chardet_encoding, charset_normalizer_encoding
+ )
if calc_eq >= 0.98:
success_count += 1
- print("️✅ ️'{}' (got '{}' but eq {} WITH {} %)".format(tbt_path, charset_normalizer_encoding, chardet_encoding, round(calc_eq * 100., 3)))
+ print(
+ f"️✅ ️'{tbt_path}' (got '{charset_normalizer_encoding}' but "
+ f"eq {chardet_encoding} WITH {round(calc_eq * 100.0, 3)} %)"
+ )
continue
- print("⚡⚡ '{}' (BC-Break) New('{}') vs Legacy('{}')".format(tbt_path, charset_normalizer_encoding, chardet_encoding))
+ print(
+ f"⚡⚡ '{tbt_path}' (BC-Break) New('{charset_normalizer_encoding}') vs Legacy('{chardet_encoding}')"
+ )
- success_ratio = round(success_count / total_count, 2) * 100.
+ success_ratio = round(success_count / total_count, 2) * 100.0
- print("Total EST BC = {} % ({} / {} files)".format(success_ratio, success_count, total_count))
+ print(f"Total EST BC = {success_ratio} % ({success_count} / {total_count} files)")
return 0 if success_ratio >= args.coverage else 1
if __name__ == "__main__":
- exit(
- cli_bc(
- argv[1:]
- )
- )
+ exit(cli_bc(argv[1:]))
diff --git a/bin/coverage.py b/bin/coverage.py
index e5f07bd..80e1822 100644
--- a/bin/coverage.py
+++ b/bin/coverage.py
@@ -1,43 +1,54 @@
-#!/bin/python
+from __future__ import annotations
+
+import argparse
from glob import glob
+from os import sep
from os.path import isdir
from sys import argv
-from typing import List
-import argparse
-from charset_normalizer import from_path, __version__
+from charset_normalizer import __version__, from_path
from charset_normalizer.utils import iana_name
-from os import sep
-
def calc_equivalence(content: bytes, cp_a: str, cp_b: str):
str_a = content.decode(cp_a)
str_b = content.decode(cp_b)
character_count = len(str_a)
- diff_character_count = sum(
- chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)
- )
-
+ diff_character_count = sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b))
- return 1. - (diff_character_count / character_count)
+ return 1.0 - (diff_character_count / character_count)
-def cli_coverage(arguments: List[str]):
+def cli_coverage(arguments: list[str]):
parser = argparse.ArgumentParser(
description="Embedded detection success coverage script checker for Charset-Normalizer"
)
- parser.add_argument('-p', '--with-preemptive', action="store_true", default=False, dest='preemptive',
- help='Enable the preemptive scan behaviour during coverage check')
- parser.add_argument('-c', '--coverage', action="store", default=90, type=int, dest='coverage',
- help="Define the minimum acceptable coverage to succeed")
+ parser.add_argument(
+ "-p",
+ "--with-preemptive",
+ action="store_true",
+ default=False,
+ dest="preemptive",
+ help="Enable the preemptive scan behaviour during coverage check",
+ )
+ parser.add_argument(
+ "-c",
+ "--coverage",
+ action="store",
+ default=90,
+ type=int,
+ dest="coverage",
+ help="Define the minimum acceptable coverage to succeed",
+ )
args = parser.parse_args(arguments)
if not isdir("./char-dataset"):
- print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory")
+ print(
+ "This script require https://github.com/Ousret/char-dataset to be cloned on package root directory"
+ )
exit(1)
print(f"> using charset-normalizer {__version__}")
@@ -46,28 +57,27 @@ def cli_coverage(arguments: List[str]):
total_count = 0
for tbt_path in sorted(glob("./char-dataset/**/*.*")):
-
expected_encoding = tbt_path.split(sep)[-2]
total_count += 1
- results = from_path(
- tbt_path,
- preemptive_behaviour=args.preemptive
- )
+ results = from_path(tbt_path, preemptive_behaviour=args.preemptive)
if expected_encoding == "None" and len(results) == 0:
- print("✅✅ '{}'".format(tbt_path))
+ print(f"✅✅ '{tbt_path}'")
success_count += 1
continue
if len(results) == 0:
- print("⚡⚡ '{}' (nothing)".format(tbt_path))
+ print(f"⚡⚡ '{tbt_path}' (nothing)")
continue
result = results.best()
- if expected_encoding in result.could_be_from_charset or iana_name(expected_encoding) in result.could_be_from_charset:
- print("✅✅ '{}'".format(tbt_path))
+ if (
+ expected_encoding in result.could_be_from_charset
+ or iana_name(expected_encoding) in result.could_be_from_charset
+ ):
+ print(f"✅✅ '{tbt_path}'")
success_count += 1
continue
@@ -75,21 +85,21 @@ def cli_coverage(arguments: List[str]):
if calc_eq >= 0.98:
success_count += 1
- print("️✅ ️'{}' (got '{}' but equivalence {} %)".format(tbt_path, result.encoding, round(calc_eq * 100., 3)))
+ print(
+ f"️✅ ️'{tbt_path}' (got '{result.encoding}' but equivalence {round(calc_eq * 100.0, 3)} %)"
+ )
continue
- print("⚡ '{}' (got '{}')".format(tbt_path, result.encoding))
+ print(f"⚡ '{tbt_path}' (got '{result.encoding}')")
- success_ratio = round(success_count / total_count, 2) * 100.
+ success_ratio = round(success_count / total_count, 2) * 100.0
- print("Total EST coverage = {} % ({} / {} files)".format(success_ratio, success_count, total_count))
+ print(
+ f"Total EST coverage = {success_ratio} % ({success_count} / {total_count} files)"
+ )
return 0 if success_ratio >= args.coverage else 1
if __name__ == "__main__":
- exit(
- cli_coverage(
- argv[1:]
- )
- )
+ exit(cli_coverage(argv[1:]))
diff --git a/bin/integration.py b/bin/integration.py
deleted file mode 100644
index b186313..0000000
--- a/bin/integration.py
+++ /dev/null
@@ -1,54 +0,0 @@
-from requests import get, __version__
-from typing import List
-from charset_normalizer import detect, __version__ as __version_cn__
-
-if __name__ == "__main__":
-
- print(f"requests {__version__}")
- print(f"charset_normalizer {__version_cn__}")
-
- files: List[str] = get("http://127.0.0.1:8080/").json()
-
- print("## Testing with actual files")
-
- for file in files:
- r = get(
- "http://127.0.0.1:8080/" + file
- )
-
- if r.ok is False:
- print(f"Unable to retrieve '{file}' | HTTP/{r.status_code}")
- exit(1)
-
- expected_encoding = detect(r.content)["encoding"]
-
- if expected_encoding != r.apparent_encoding:
- print(f"Integration test failed | File '{file}' | Expected '{expected_encoding}' got '{r.apparent_encoding}'")
- exit(1)
-
- print(f"✅✅ '{file}' OK")
-
- print("## Testing with edge cases")
-
- # Should NOT crash
- get("http://127.0.0.1:8080/edge/empty/json").json()
-
- print("✅✅ Empty JSON OK")
-
- if get("http://127.0.0.1:8080/edge/empty/plain").apparent_encoding != "utf-8":
- print("Empty payload SHOULD not return apparent_encoding != UTF-8")
- exit(1)
-
- print("✅✅ Empty Plain Text OK")
-
- r = get("http://127.0.0.1:8080/edge/gb18030/json")
-
- if r.apparent_encoding != "GB18030":
- print("JSON Basic Detection FAILURE (/edge/gb18030/json)")
- exit(1)
-
- r.json()
-
- print("✅✅ GB18030 JSON Encoded OK")
-
- print("Integration tests passed!")
diff --git a/bin/performance.py b/bin/performance.py
index ff715fd..3a55c18 100644
--- a/bin/performance.py
+++ b/bin/performance.py
@@ -1,15 +1,16 @@
-#!/bin/python
-from glob import glob
-from time import perf_counter_ns
+from __future__ import annotations
+
import argparse
-from sys import argv
+from glob import glob
+from math import ceil
from os.path import isdir
+from statistics import mean, stdev
+from sys import argv
+from time import perf_counter_ns
-from charset_normalizer import detect
from chardet import detect as chardet_detect
-from statistics import mean, stdev
-from math import ceil
+from charset_normalizer import detect
def calc_percentile(data, percentile):
@@ -66,7 +67,8 @@ def performance_compare(arguments):
charset_normalizer_time = charset_normalizer_time or 0.000005
cn_faster = (chardet_time / charset_normalizer_time) * 100 - 100
print(
- f"{idx+1:>3}/{total_files} {tbt_path:<82} C:{chardet_time:.5f} CN:{charset_normalizer_time:.5f} {cn_faster:.1f} %"
+ f"{idx + 1:>3}/{total_files} {tbt_path:<82} C:{chardet_time:.5f} "
+ f"CN:{charset_normalizer_time:.5f} {cn_faster:.1f} %"
)
# Print the top 10 rows with the slowest execution time
@@ -78,7 +80,7 @@ def performance_compare(arguments):
)
for idx, time in sorted_results[:10]:
tbt_path = file_list[idx]
- print(f"{idx+1:>3}/{total_files} {tbt_path:<82} CN:{time:.5f}")
+ print(f"{idx + 1:>3}/{total_files} {tbt_path:<82} CN:{time:.5f}")
# Print charset normalizer statistics
min_time = min(charset_normalizer_results)
diff --git a/bin/run_autofix.sh b/bin/run_autofix.sh
deleted file mode 100755
index f8a6381..0000000
--- a/bin/run_autofix.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/sh -e
-
-export PREFIX=""
-if [ -d 'venv' ] ; then
- export PREFIX="venv/bin/"
-fi
-
-set -x
-
-${PREFIX}pip install -r ./dev-requirements.txt
-${PREFIX}black --target-version=py37 charset_normalizer
-${PREFIX}isort charset_normalizer
diff --git a/bin/run_checks.sh b/bin/run_checks.sh
deleted file mode 100755
index 951611f..0000000
--- a/bin/run_checks.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/sh -e
-
-export PREFIX=""
-if [ -d 'venv' ] ; then
- export PREFIX="venv/bin/"
-fi
-
-set -x
-
-${PREFIX}pip install -r ./dev-requirements.txt
-${PREFIX}pytest
-${PREFIX}black --check --diff --target-version=py37 charset_normalizer
-${PREFIX}flake8 charset_normalizer
-${PREFIX}mypy charset_normalizer
-${PREFIX}isort --check --diff charset_normalizer
diff --git a/bin/serve.py b/bin/serve.py
deleted file mode 100644
index 0b055ef..0000000
--- a/bin/serve.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from flask import Flask, jsonify, send_from_directory
-from glob import glob
-
-app = Flask(__name__)
-
-
-@app.route('/raw/')
-def read_file(path):
- return send_from_directory('../char-dataset', path, as_attachment=True), 200, {"Content-Type": "text/plain"}
-
-
-@app.route("/")
-def read_targets():
- return jsonify(
- [
- el.replace("./char-dataset", "/raw").replace("\\", "/") for el in sorted(glob("./char-dataset/**/*"))
- ]
- )
-
-
-@app.route("/edge/empty/plain")
-def read_empty_response_plain():
- return b"", 200, {"Content-Type": "text/plain"}
-
-
-@app.route("/edge/empty/json")
-def read_empty_response_json():
- return b"{}", 200, {"Content-Type": "application/json"}
-
-
-@app.route("/edge/gb18030/json")
-def read_gb18030_response_json():
- return '{"abc": "我没有埋怨,磋砣的只是一些时间。。今觀俗士之論也,以族舉德,以位命賢,茲可謂得論之一體矣,而未獲至論之淑真也。"}'.encode("gb18030"), 200, {"Content-Type": "application/json"}
-
-
-if __name__ == "__main__":
- app.run(host="127.0.0.1", port=8080)
diff --git a/build-requirements.txt b/build-requirements.txt
deleted file mode 100644
index 9313e96..0000000
--- a/build-requirements.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-# in the meantime we migrate to pyproject.toml
-# this represent the minimum requirement to build (for the optional speedup)
-mypy==1.6.1; python_version >= '3.8'
-mypy==1.4.1; python_version < '3.8'
-build==0.10.0
-wheel==0.41.2
diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py
deleted file mode 100644
index 5a4da4f..0000000
--- a/charset_normalizer/version.py
+++ /dev/null
@@ -1,6 +0,0 @@
-"""
-Expose version
-"""
-
-__version__ = "3.3.2"
-VERSION = __version__.split(".")
diff --git a/debian/.gitignore b/debian/.gitignore
new file mode 100644
index 0000000..2c8afeb
--- /dev/null
+++ b/debian/.gitignore
@@ -0,0 +1 @@
+/files
diff --git a/debian/changelog b/debian/changelog
index bb3e29d..029ed53 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,53 @@
+python-charset-normalizer (3.4.2-1) unstable; urgency=medium
+
+ * Team upload.
+ * New upstream release.
+
+ -- Colin Watson Mon, 05 May 2025 11:40:18 +0100
+
+python-charset-normalizer (3.4.1-1) unstable; urgency=medium
+
+ * Team upload.
+ * New upstream release.
+
+ -- Colin Watson Tue, 18 Feb 2025 13:02:04 +0000
+
+python-charset-normalizer (3.4.0-1) unstable; urgency=medium
+
+ * Team upload.
+ * New upstream version
+ * Refresh patches
+
+ -- Michael R. Crusoe Wed, 23 Oct 2024 18:48:05 +0200
+
+python-charset-normalizer (3.3.2-4) unstable; urgency=medium
+
+ * Team upload.
+ * Add patch to make the build reproducible
+
+ -- Mattia Rizzolo Thu, 19 Sep 2024 17:35:46 +0200
+
+python-charset-normalizer (3.3.2-3) unstable; urgency=medium
+
+ * Team upload.
+ * Break dependency loop (us→sphinx→requests→us).
+ Thank you Samuel Thibault
+ & Adrian Bunk for the fixes.
+ Closes: #1078012
+ * Standards-Version: 4.7.0 (routine-update)
+ * Remove trailing whitespace in debian/copyright (routine-update)
+ * Set upstream metadata fields: Bug-Database, Bug-Submit, Repository, Repository-Browse, Security-Contact.
+
+ -- Michael R. Crusoe Sat, 14 Sep 2024 10:54:30 +0200
+
+python-charset-normalizer (3.3.2-2) unstable; urgency=medium
+
+ * Team upload.
+ * Build binary versions using mypyc to increase performance; as
+ upstream does.
+
+ -- Michael R. Crusoe Sat, 03 Aug 2024 16:31:55 +0200
+
python-charset-normalizer (3.3.2-1) unstable; urgency=medium
* Team upload
diff --git a/debian/control b/debian/control
index 9c7d0b7..c7c7ad2 100644
--- a/debian/control
+++ b/debian/control
@@ -8,13 +8,17 @@ Build-Depends:
dh-sequence-python3,
pybuild-plugin-pyproject,
python3-all,
+ python3-all-dev,
+ python3-mypy,
python3-pytest ,
python3-pytest-cov ,
python3-setuptools,
+ sphinx-common,
+Build-Depends-Indep:
python3-sphinx,
- furo ,
+ furo,
Rules-Requires-Root: no
-Standards-Version: 4.6.2
+Standards-Version: 4.7.0
Homepage: https://github.com/ousret/charset_normalizer
Vcs-Git: https://salsa.debian.org/python-team/packages/python-charset-normalizer.git
Vcs-Browser: https://salsa.debian.org/python-team/packages/python-charset-normalizer
@@ -38,10 +42,11 @@ Description: charset, encoding and language detection for Python (Documentation)
This package contains the documentation.
Package: python3-charset-normalizer
-Architecture: all
+Architecture: any
Depends:
${misc:Depends},
${python3:Depends},
+ ${shlibs:Depends}
Suggests:
python-charset-normalizer-doc,
Description: charset, encoding and language detection (Python 3)
diff --git a/debian/copyright b/debian/copyright
index 8e439c6..383abc2 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -22,7 +22,7 @@ License: CC-BY-SA-3.0
SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT
RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS"
BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION
- PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE.
+ PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE.
.
License
.
diff --git a/debian/patches/Make-the-build-reproducible-assure-stable-order-use-a-tup.patch b/debian/patches/Make-the-build-reproducible-assure-stable-order-use-a-tup.patch
new file mode 100644
index 0000000..7461c23
--- /dev/null
+++ b/debian/patches/Make-the-build-reproducible-assure-stable-order-use-a-tup.patch
@@ -0,0 +1,23 @@
+From: Mattia Rizzolo
+Date: Thu, 19 Sep 2024 17:31:55 +0200
+Subject: Make the build reproducible: assure stable order,
+ use a tuple instead of a set
+
+Signed-off-by: Mattia Rizzolo
+---
+ src/charset_normalizer/md.py | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/src/charset_normalizer/md.py b/src/charset_normalizer/md.py
+index 12ce024..9f002be 100644
+--- a/src/charset_normalizer/md.py
++++ b/src/charset_normalizer/md.py
+@@ -341,7 +341,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
+ self._buffer_accent_count = 0
+ self._buffer_glyph_count = 0
+ elif (
+- character not in {"<", ">", "-", "=", "~", "|", "_"}
++ character not in ("<", ">", "-", "=", "~", "|", "_")
+ and character.isdigit() is False
+ and is_symbol(character)
+ ):
diff --git a/debian/patches/docs-Prevent-possible-privacy-issue-to-a-linked-image.patch b/debian/patches/docs-Prevent-possible-privacy-issue-to-a-linked-image.patch
index e95aa5c..fd50579 100644
--- a/debian/patches/docs-Prevent-possible-privacy-issue-to-a-linked-image.patch
+++ b/debian/patches/docs-Prevent-possible-privacy-issue-to-a-linked-image.patch
@@ -11,7 +11,7 @@ Foreward: Not-Needed
1 file changed, 5 deletions(-)
diff --git a/docs/index.rst b/docs/index.rst
-index 05d5f98..0762a03 100755
+index da8a9b6..c0c418b 100755
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -11,11 +11,6 @@ All IANA character set names for which the Python core library provides codecs a
diff --git a/debian/patches/mypyc-debug-symbols b/debian/patches/mypyc-debug-symbols
new file mode 100644
index 0000000..d4bc918
--- /dev/null
+++ b/debian/patches/mypyc-debug-symbols
@@ -0,0 +1,22 @@
+From: "Michael R. Crusoe"
+Date: Thu, 19 Sep 2024 17:31:08 +0200
+Subject: Produce debugging symbols
+
+Forwarded: not-needed
+---
+ setup.py | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/setup.py b/setup.py
+index da2a69f..d521f07 100644
+--- a/setup.py
++++ b/setup.py
+@@ -21,7 +21,7 @@ if USE_MYPYC:
+ [
+ "src/charset_normalizer/md.py",
+ ],
+- debug_level="0",
++ debug_level="1",
+ opt_level="3",
+ )
+ else:
diff --git a/debian/patches/series b/debian/patches/series
index 2ecd2c9..7349ea8 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1 +1,3 @@
+mypyc-debug-symbols
docs-Prevent-possible-privacy-issue-to-a-linked-image.patch
+Make-the-build-reproducible-assure-stable-order-use-a-tup.patch
diff --git a/debian/rules b/debian/rules
index ba918f9..7583222 100755
--- a/debian/rules
+++ b/debian/rules
@@ -11,13 +11,17 @@ SPHINXOPTS := -N -D html_last_updated_fmt="$(BUILD_DATE)"
export PYBUILD_NAME=charset_normalizer
export PYBUILD_BEFORE_TEST=cp -r {dir}/data {build_dir}
export PYBUILD_AFTER_TEST=rm -rf {build_dir}/.coverage {build_dir}/data/
+export CHARSET_NORMALIZER_USE_MYPYC=1
%:
dh $@ --with=sphinxdoc --buildsystem=pybuild
-override_dh_sphinxdoc:
-ifeq (,$(findstring nodoc, $(DEB_BUILD_OPTIONS)))
- PYTHONPATH=. python3 -m sphinx -b html $(SPHINXOPTS) docs/ $(CURDIR)/debian/python-charset-normalizer-doc/usr/share/doc/python-charset-normalizer-doc/html
+override_dh_sphinxdoc-arch:
+
+override_dh_auto_build-indep:
+
+override_dh_auto_test-indep:
+
+execute_before_dh_sphinxdoc-indep:
+ PYTHONPATH=src python3 -m sphinx -b html $(SPHINXOPTS) docs/ $(CURDIR)/debian/python-charset-normalizer-doc/usr/share/doc/python-charset-normalizer-doc/html
rm $(CURDIR)/debian/python-charset-normalizer-doc/usr/share/doc/python-charset-normalizer-doc/html/.nojekyll
- dh_sphinxdoc
-endif
diff --git a/debian/tests/upstream-tests b/debian/tests/upstream-tests
index e0d3d08..7cb5a3d 100644
--- a/debian/tests/upstream-tests
+++ b/debian/tests/upstream-tests
@@ -2,7 +2,7 @@
set -e -u
-cp -a charset_normalizer data setup.cfg tests "${AUTOPKGTEST_TMP}"
+cp -a src data pyproject.toml tests "${AUTOPKGTEST_TMP}"
for py3vers in $(py3versions -s); do
echo
diff --git a/debian/upstream/metadata b/debian/upstream/metadata
index 4b950f4..943919e 100644
--- a/debian/upstream/metadata
+++ b/debian/upstream/metadata
@@ -1,7 +1,8 @@
---
-Bug-Database: https://github.com/Ousret/charset_normalizer/issues
-Bug-Submit: https://github.com/Ousret/charset_normalizer/issues/new
+Bug-Database: https://github.com/jawah/charset_normalizer/issues
+Bug-Submit: https://github.com/jawah/charset_normalizer/issues/new
Documentation: https://charset-normalizer.readthedocs.io/en/latest
FAQ: https://github.com/Ousret/charset_normalizer/blob/master/README.md
-Repository: https://github.com/ousret/charset_normalizer.git
-Repository-Browse: https://github.com/ousret/charset_normalizer
+Repository: https://github.com/jawah/charset_normalizer.git
+Repository-Browse: https://github.com/jawah/charset_normalizer
+Security-Contact: https://github.com/ousret/charset_normalizer/tree/HEAD/SECURITY.md
diff --git a/debian/watch b/debian/watch
index 7f93aba..c17b32f 100644
--- a/debian/watch
+++ b/debian/watch
@@ -4,5 +4,5 @@ opts="mode=git, \
compression=gz, \
uversionmangle=s/(\d)[_\.\-\+]?((RC|rc|pre|dev|beta|alpha)\.?\d*)$/$1~$2/, \
dversionmangle=s/\+(dfsg|ds)(\.?\d+)?$//" \
-https://github.com/Ousret/charset_normalizer.git \
+https://github.com/jawah/charset_normalizer.git \
refs/tags/@ANY_VERSION@
diff --git a/dev-requirements.txt b/dev-requirements.txt
index c25bd74..9c85e11 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1,14 +1,2 @@
-flake8==5.0.4
-chardet==5.1.0
-isort==5.11.4
-codecov==2.1.13
-pytest-cov==4.1.0
-build==0.10.0
-wheel==0.41.2
-
-black==23.3.0
-mypy==1.6.1; python_version >= '3.8'
-mypy==1.4.1; python_version < '3.8'
-Flask==2.2.3
-pytest==7.4.3
-requests==2.31.0
+coverage>=7.2.7,<7.9
+pytest>=7.4.4,<9
diff --git a/docs/community/featured.rst b/docs/community/featured.rst
index 8d1814c..a704a0b 100644
--- a/docs/community/featured.rst
+++ b/docs/community/featured.rst
@@ -9,10 +9,7 @@ your level or opinions.
Niquests
--------
-Started as a simple though..
-
-.. image:: https://i.imgflip.com/7xet0f.jpg
- :width: 200
+Started as a simple though.. IE 11 has built-in HTTP/2 support while Requests 2.32 does not!
Most of our programs that interact with HTTP server are built with ``requests`` and
we aren't likely to switch without a substantial effort.
diff --git a/docs/index.rst b/docs/index.rst
index 19ca08a..da8a9b6 100755
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -20,7 +20,7 @@ It aims to be as generic as possible.
It is released under MIT license, see LICENSE for more
details. Be aware that no warranty of any kind is provided with this package.
-Copyright (C) 2023 Ahmed TAHRI
+Copyright (C) 2025 Ahmed TAHRI
Introduction
============
diff --git a/noxfile.py b/noxfile.py
new file mode 100644
index 0000000..bc60155
--- /dev/null
+++ b/noxfile.py
@@ -0,0 +1,234 @@
+from __future__ import annotations
+
+import os
+import shutil
+
+import nox
+
+
+def test_impl(
+ session: nox.Session,
+ use_mypyc: bool = False,
+):
+ # Install deps and the package itself.
+ session.install("-U", "pip", "setuptools", silent=False)
+ session.install("-r", "dev-requirements.txt", silent=False)
+
+ session.install(
+ ".",
+ silent=False,
+ env={"CHARSET_NORMALIZER_USE_MYPYC": "1" if use_mypyc else "0"},
+ )
+
+ # Show the pip version.
+ session.run("pip", "--version")
+ # Print the Python version and bytesize.
+ session.run("python", "--version")
+ # Show charset-normalizer cli info
+ session.run("normalizer", "--version")
+
+ # Inspired from https://hynek.me/articles/ditch-codecov-python/
+ # We use parallel mode and then combine in a later CI step
+ session.run(
+ "python",
+ "-m",
+ "coverage",
+ "run",
+ "--parallel-mode",
+ "-m",
+ "pytest",
+ "-v",
+ "-ra",
+ f"--color={'yes' if 'GITHUB_ACTIONS' in os.environ else 'auto'}",
+ "--tb=native",
+ "--durations=10",
+ "--strict-config",
+ "--strict-markers",
+ *(session.posargs or ("tests/",)),
+ env={
+ "PYTHONWARNINGS": "always::DeprecationWarning",
+ "COVERAGE_CORE": "sysmon",
+ },
+ )
+
+
+@nox.session(
+ python=["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14", "pypy"]
+)
+def test(session: nox.Session) -> None:
+ test_impl(session)
+
+
+@nox.session(python=["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"])
+def test_mypyc(session: nox.Session) -> None:
+ test_impl(session, True)
+
+
+def git_clone(session: nox.Session, git_url: str) -> None:
+ """We either clone the target repository or if already exist
+ simply reset the state and pull.
+ """
+ expected_directory = git_url.split("/")[-1]
+
+ if expected_directory.endswith(".git"):
+ expected_directory = expected_directory[:-4]
+
+ if not os.path.isdir(expected_directory):
+ session.run("git", "clone", "--depth", "1", git_url, external=True)
+ else:
+ session.run(
+ "git", "-C", expected_directory, "reset", "--hard", "HEAD", external=True
+ )
+ session.run("git", "-C", expected_directory, "pull", external=True)
+
+
+@nox.session()
+def backward_compatibility(session: nox.Session) -> None:
+ git_clone(session, "https://github.com/ousret/char-dataset")
+
+ # Install deps and the package itself.
+ session.install("-U", "pip", "setuptools", silent=False)
+ session.install("-r", "dev-requirements.txt", silent=False)
+
+ session.install(".", silent=False)
+ session.install("chardet")
+
+ session.run(
+ "python",
+ "bin/bc.py",
+ *(session.posargs or ("--coverage=85",)),
+ )
+
+
+@nox.session()
+def coverage(session: nox.Session) -> None:
+ git_clone(session, "https://github.com/ousret/char-dataset")
+
+ # Install deps and the package itself.
+ session.install("-U", "pip", "setuptools", silent=False)
+ session.install("-r", "dev-requirements.txt", silent=False)
+
+ session.install(".", silent=False)
+
+ # Show the pip version.
+ session.run("pip", "--version")
+ # Print the Python version and bytesize.
+ session.run("python", "--version")
+ # Show charset-normalizer cli info
+ session.run("normalizer", "--version")
+
+ session.run(
+ "python",
+ "-m",
+ "coverage",
+ "run",
+ "--parallel-mode",
+ "bin/coverage.py",
+ *(session.posargs or ("--coverage=90", "--with-preemptive")),
+ )
+
+
+@nox.session()
+def performance(session: nox.Session) -> None:
+ git_clone(session, "https://github.com/ousret/char-dataset")
+
+ # Install deps and the package itself.
+ session.install("-U", "pip", "setuptools", silent=False)
+ session.install("-r", "dev-requirements.txt", silent=False)
+
+ session.install("chardet")
+ session.install(".", silent=False, env={"CHARSET_NORMALIZER_USE_MYPYC": "1"})
+
+ session.run(
+ "python",
+ "bin/performance.py",
+ *(session.posargs or ()),
+ )
+
+
+@nox.session()
+def downstream_niquests(session: nox.Session) -> None:
+ root = os.getcwd()
+ tmp_dir = session.create_tmp()
+
+ session.cd(tmp_dir)
+ git_clone(session, "https://github.com/jawah/niquests")
+ session.chdir("niquests")
+
+ session.run("git", "rev-parse", "HEAD", external=True)
+ session.install(".[socks]", silent=False)
+ session.install("-r", "requirements-dev.txt", silent=False)
+
+ session.cd(root)
+ session.install(".", silent=False)
+ session.cd(f"{tmp_dir}/niquests")
+
+ session.run(
+ "python",
+ "-c",
+ "import charset_normalizer; print(charset_normalizer.__version__)",
+ )
+ session.run(
+ "python",
+ "-m",
+ "pytest",
+ "-v",
+ f"--color={'yes' if 'GITHUB_ACTIONS' in os.environ else 'auto'}",
+ *(session.posargs or ("tests/",)),
+ env={"NIQUESTS_STRICT_OCSP": "1"},
+ )
+
+
+@nox.session()
+def downstream_requests(session: nox.Session) -> None:
+ root = os.getcwd()
+ tmp_dir = session.create_tmp()
+
+ session.cd(tmp_dir)
+ git_clone(session, "https://github.com/psf/requests")
+ session.chdir("requests")
+
+ session.run("git", "rev-parse", "HEAD", external=True)
+ session.install(".[socks]", silent=False)
+ session.install("-r", "requirements-dev.txt", silent=False)
+
+ session.cd(root)
+ session.install(".", silent=False)
+ session.cd(f"{tmp_dir}/requests")
+
+ session.run(
+ "python",
+ "-c",
+ "import charset_normalizer; print(charset_normalizer.__version__)",
+ )
+ session.run(
+ "python",
+ "-m",
+ "pytest",
+ "-v",
+ f"--color={'yes' if 'GITHUB_ACTIONS' in os.environ else 'auto'}",
+ *(session.posargs or ("tests/",)),
+ )
+
+
+@nox.session()
+def format(session: nox.Session) -> None:
+ """Run code formatters."""
+ lint(session)
+
+
+@nox.session
+def lint(session: nox.Session) -> None:
+ session.install("pre-commit")
+ session.run("pre-commit", "run", "--all-files")
+
+
+@nox.session
+def docs(session: nox.Session) -> None:
+ session.install("-r", "docs/requirements.txt")
+ session.install(".")
+
+ session.chdir("docs")
+ if os.path.exists("_build"):
+ shutil.rmtree("_build")
+ session.run("sphinx-build", "-b", "html", "-W", ".", "_build/html")
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..34245fa
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,85 @@
+[build-system]
+requires = ["setuptools", "setuptools-scm", "mypy>=1.4.1,<=1.15.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "charset-normalizer"
+description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
+license = {text = "MIT"}
+keywords = ["encoding", "charset", "charset-detector", "detector", "normalization", "unicode", "chardet", "detect"]
+authors = [
+ {name = "Ahmed R. TAHRI", email="tahri.ahmed@proton.me"},
+]
+maintainers = [
+ {name = "Ahmed R. TAHRI", email="tahri.ahmed@proton.me"},
+]
+classifiers = [
+ "Development Status :: 5 - Production/Stable",
+ "Intended Audience :: Developers",
+ "License :: OSI Approved :: MIT License",
+ "Operating System :: OS Independent",
+ "Programming Language :: Python",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.7",
+ "Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
+ "Programming Language :: Python :: 3 :: Only",
+ "Programming Language :: Python :: Implementation :: CPython",
+ "Programming Language :: Python :: Implementation :: PyPy",
+ "Topic :: Text Processing :: Linguistic",
+ "Topic :: Utilities",
+ "Typing :: Typed",
+]
+requires-python = ">=3.7"
+dynamic = ["version", "readme"]
+
+[project.optional-dependencies]
+unicode_backport = []
+
+[tool.setuptools]
+package-dir = {"" = "src"}
+packages = ["charset_normalizer", "charset_normalizer.cli", ]
+
+[tool.setuptools.dynamic]
+version = {attr = "charset_normalizer.__version__"}
+readme = {file = ["README.md", "CHANGELOG.md", "LICENSE"], content-type = "text/markdown"}
+
+[project.scripts]
+normalizer = "charset_normalizer:cli.cli_detect"
+
+[project.urls]
+"Changelog" = "https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md"
+"Documentation" = "https://charset-normalizer.readthedocs.io/"
+"Code" = "https://github.com/jawah/charset_normalizer"
+"Issue tracker" = "https://github.com/jawah/charset_normalizer/issues"
+
+[tool.pytest.ini_options]
+log_level = "DEBUG"
+filterwarnings = [
+ "error",
+]
+
+[tool.isort]
+profile = "black"
+add_imports = "from __future__ import annotations"
+
+[tool.mypy]
+check_untyped_defs = true
+disallow_any_generics = true
+disallow_incomplete_defs = true
+disallow_subclassing_any = true
+disallow_untyped_calls = true
+disallow_untyped_decorators = true
+disallow_untyped_defs = true
+no_implicit_optional = true
+no_implicit_reexport = true
+show_error_codes = true
+strict_equality = true
+warn_redundant_casts = true
+warn_return_any = true
+warn_unused_configs = true
+warn_unused_ignores = false
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 72fbc05..0000000
--- a/setup.cfg
+++ /dev/null
@@ -1,70 +0,0 @@
-[metadata]
-name = charset-normalizer
-description = The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
-long_description = file: README.md, CHANGELOG.md, LICENSE
-long_description_content_type = text/markdown
-keywords = encoding, charset, charset-detector, detector, normalization, unicode, chardet, detect
-url = https://github.com/Ousret/charset_normalizer
-license = MIT
-author_email = ahmed.tahri@cloudnursery.dev
-author = Ahmed TAHRI
-project_urls =
- Bug Reports = https://github.com/Ousret/charset_normalizer/issues
- Documentation = https://charset-normalizer.readthedocs.io/en/latest
-classifiers =
- Development Status :: 5 - Production/Stable
- License :: OSI Approved :: MIT License
- Intended Audience :: Developers
- Topic :: Software Development :: Libraries :: Python Modules
- Operating System :: OS Independent
- Programming Language :: Python
- Programming Language :: Python :: 3
- Programming Language :: Python :: 3.7
- Programming Language :: Python :: 3.8
- Programming Language :: Python :: 3.9
- Programming Language :: Python :: 3.10
- Programming Language :: Python :: 3.11
- Programming Language :: Python :: 3.12
- Programming Language :: Python :: Implementation :: PyPy
- Topic :: Text Processing :: Linguistic
- Topic :: Utilities
- Typing :: Typed
-
-[options.packages.find]
-exclude =
- tests
- *.tests
- *.tests.*
- tests.*
- docs*
- data*
-
-[options.extras_require]
-unicode_backport =
-
-[options.entry_points]
-console_scripts =
- normalizer = charset_normalizer.cli:cli_detect
-
-[options]
-packages = find:
-include_package_data = True
-python_requires = >=3.7.0
-
-[options.package_data]
-charset_normalizer = py.typed
-
-[tool:pytest]
-addopts = --cov=charset_normalizer --cov-report=term-missing -rxXs
-
-[flake8]
-ignore = W503, E203, B305
-max-line-length = 120
-
-[mypy]
-disallow_untyped_defs = True
-ignore_missing_imports = True
-
-[tool:isort]
-profile = black
-combine_as_imports = True
diff --git a/setup.py b/setup.py
index 0ccc7e9..da2a69f 100644
--- a/setup.py
+++ b/setup.py
@@ -1,38 +1,30 @@
#!/usr/bin/env python
-# -*- coding: utf-8 -*-
+from __future__ import annotations
import os
import sys
-from re import search
from setuptools import setup
-
-def get_version():
- with open('charset_normalizer/version.py') as version_file:
- return search(r"""__version__\s+=\s+(['"])(?P<version>.+?)\1""",
- version_file.read()).group('version')
-
-
USE_MYPYC = False
if len(sys.argv) > 1 and sys.argv[1] == "--use-mypyc":
sys.argv.pop(1)
USE_MYPYC = True
-if os.getenv("CHARSET_NORMALIZER_USE_MYPYC", None) == "1":
+elif os.getenv("CHARSET_NORMALIZER_USE_MYPYC", None) == "1":
USE_MYPYC = True
if USE_MYPYC:
from mypyc.build import mypycify
- MYPYC_MODULES = mypycify([
- "charset_normalizer/md.py",
- ], debug_level="0")
+ MYPYC_MODULES = mypycify(
+ [
+ "src/charset_normalizer/md.py",
+ ],
+ debug_level="0",
+ opt_level="3",
+ )
else:
MYPYC_MODULES = None
-setup(
- name="charset-normalizer",
- version=get_version(),
- ext_modules=MYPYC_MODULES
-)
+setup(name="charset-normalizer", ext_modules=MYPYC_MODULES)
diff --git a/charset_normalizer/__init__.py b/src/charset_normalizer/__init__.py
similarity index 97%
rename from charset_normalizer/__init__.py
rename to src/charset_normalizer/__init__.py
index 55991fc..0d3a379 100644
--- a/charset_normalizer/__init__.py
+++ b/src/charset_normalizer/__init__.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
"""
Charset-Normalizer
~~~~~~~~~~~~~~
@@ -19,6 +18,9 @@
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
+
+from __future__ import annotations
+
import logging
from .api import from_bytes, from_fp, from_path, is_binary
diff --git a/charset_normalizer/__main__.py b/src/charset_normalizer/__main__.py
similarity index 66%
rename from charset_normalizer/__main__.py
rename to src/charset_normalizer/__main__.py
index beae2ef..e0e76f7 100644
--- a/charset_normalizer/__main__.py
+++ b/src/charset_normalizer/__main__.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
from .cli import cli_detect
if __name__ == "__main__":
diff --git a/charset_normalizer/api.py b/src/charset_normalizer/api.py
similarity index 84%
rename from charset_normalizer/api.py
rename to src/charset_normalizer/api.py
index 0ba08e3..2c8c061 100644
--- a/charset_normalizer/api.py
+++ b/src/charset_normalizer/api.py
@@ -1,6 +1,8 @@
+from __future__ import annotations
+
import logging
from os import PathLike
-from typing import BinaryIO, List, Optional, Set, Union
+from typing import BinaryIO
from .cd import (
coherence_ratio,
@@ -21,8 +23,6 @@
should_strip_sig_or_bom,
)
-# Will most likely be controversial
-# logging.addLevelName(TRACE, "TRACE")
logger = logging.getLogger("charset_normalizer")
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
@@ -31,12 +31,12 @@
def from_bytes(
- sequences: Union[bytes, bytearray],
+ sequences: bytes | bytearray,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.2,
- cp_isolation: Optional[List[str]] = None,
- cp_exclusion: Optional[List[str]] = None,
+ cp_isolation: list[str] | None = None,
+ cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
@@ -62,7 +62,7 @@ def from_bytes(
if not isinstance(sequences, (bytearray, bytes)):
raise TypeError(
- "Expected object of type bytes or bytearray, got: {0}".format(
+ "Expected object of type bytes or bytearray, got: {}".format(
type(sequences)
)
)
@@ -76,7 +76,7 @@ def from_bytes(
if length == 0:
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
- if explain:
+ if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level or logging.WARNING)
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
@@ -135,9 +135,9 @@ def from_bytes(
),
)
- prioritized_encodings: List[str] = []
+ prioritized_encodings: list[str] = []
- specified_encoding: Optional[str] = (
+ specified_encoding: str | None = (
any_specified_encoding(sequences) if preemptive_behaviour else None
)
@@ -149,16 +149,18 @@ def from_bytes(
specified_encoding,
)
- tested: Set[str] = set()
- tested_but_hard_failure: List[str] = []
- tested_but_soft_failure: List[str] = []
+ tested: set[str] = set()
+ tested_but_hard_failure: list[str] = []
+ tested_but_soft_failure: list[str] = []
- fallback_ascii: Optional[CharsetMatch] = None
- fallback_u8: Optional[CharsetMatch] = None
- fallback_specified: Optional[CharsetMatch] = None
+ fallback_ascii: CharsetMatch | None = None
+ fallback_u8: CharsetMatch | None = None
+ fallback_specified: CharsetMatch | None = None
results: CharsetMatches = CharsetMatches()
+ early_stop_results: CharsetMatches = CharsetMatches()
+
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
if sig_encoding is not None:
@@ -187,7 +189,7 @@ def from_bytes(
tested.add(encoding_iana)
- decoded_payload: Optional[str] = None
+ decoded_payload: str | None = None
bom_or_sig_available: bool = sig_encoding == encoding_iana
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
encoding_iana
@@ -221,16 +223,20 @@ def from_bytes(
try:
if is_too_large_sequence and is_multi_byte_decoder is False:
str(
- sequences[: int(50e4)]
- if strip_sig_or_bom is False
- else sequences[len(sig_payload) : int(50e4)],
+ (
+ sequences[: int(50e4)]
+ if strip_sig_or_bom is False
+ else sequences[len(sig_payload) : int(50e4)]
+ ),
encoding=encoding_iana,
)
else:
decoded_payload = str(
- sequences
- if strip_sig_or_bom is False
- else sequences[len(sig_payload) :],
+ (
+ sequences
+ if strip_sig_or_bom is False
+ else sequences[len(sig_payload) :]
+ ),
encoding=encoding_iana,
)
except (UnicodeDecodeError, LookupError) as e:
@@ -286,7 +292,7 @@ def from_bytes(
early_stop_count: int = 0
lazy_str_hard_failure = False
- md_chunks: List[str] = []
+ md_chunks: list[str] = []
md_ratios = []
try:
@@ -367,7 +373,13 @@ def from_bytes(
and not lazy_str_hard_failure
):
fallback_entry = CharsetMatch(
- sequences, encoding_iana, threshold, False, [], decoded_payload
+ sequences,
+ encoding_iana,
+ threshold,
+ False,
+ [],
+ decoded_payload,
+ preemptive_declaration=specified_encoding,
)
if encoding_iana == specified_encoding:
fallback_specified = fallback_entry
@@ -385,7 +397,7 @@ def from_bytes(
)
if not is_multi_byte_decoder:
- target_languages: List[str] = encoding_languages(encoding_iana)
+ target_languages: list[str] = encoding_languages(encoding_iana)
else:
target_languages = mb_encoding_languages(encoding_iana)
@@ -421,28 +433,58 @@ def from_bytes(
),
)
- results.append(
- CharsetMatch(
- sequences,
- encoding_iana,
- mean_mess_ratio,
- bom_or_sig_available,
- cd_ratios_merged,
- decoded_payload,
- )
+ current_match = CharsetMatch(
+ sequences,
+ encoding_iana,
+ mean_mess_ratio,
+ bom_or_sig_available,
+ cd_ratios_merged,
+ (
+ decoded_payload
+ if (
+ is_too_large_sequence is False
+ or encoding_iana in [specified_encoding, "ascii", "utf_8"]
+ )
+ else None
+ ),
+ preemptive_declaration=specified_encoding,
)
+ results.append(current_match)
+
if (
encoding_iana in [specified_encoding, "ascii", "utf_8"]
and mean_mess_ratio < 0.1
):
+ # If md says nothing to worry about, then... stop immediately!
+ if mean_mess_ratio == 0.0:
+ logger.debug(
+ "Encoding detection: %s is most likely the one.",
+ current_match.encoding,
+ )
+ if explain: # Defensive: ensure exit path clean handler
+ logger.removeHandler(explain_handler)
+ logger.setLevel(previous_logger_level)
+ return CharsetMatches([current_match])
+
+ early_stop_results.append(current_match)
+
+ if (
+ len(early_stop_results)
+ and (specified_encoding is None or specified_encoding in tested)
+ and "ascii" in tested
+ and "utf_8" in tested
+ ):
+ probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
logger.debug(
- "Encoding detection: %s is most likely the one.", encoding_iana
+ "Encoding detection: %s is most likely the one.",
+ probable_result.encoding,
)
- if explain:
+ if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
- return CharsetMatches([results[encoding_iana]])
+
+ return CharsetMatches([probable_result])
if encoding_iana == sig_encoding:
logger.debug(
@@ -450,7 +492,7 @@ def from_bytes(
"the beginning of the sequence.",
encoding_iana,
)
- if explain:
+ if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([results[encoding_iana]])
@@ -504,8 +546,8 @@ def from_fp(
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
- cp_isolation: Optional[List[str]] = None,
- cp_exclusion: Optional[List[str]] = None,
+ cp_isolation: list[str] | None = None,
+ cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
@@ -530,12 +572,12 @@ def from_fp(
def from_path(
- path: Union[str, bytes, PathLike], # type: ignore[type-arg]
+ path: str | bytes | PathLike, # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
- cp_isolation: Optional[List[str]] = None,
- cp_exclusion: Optional[List[str]] = None,
+ cp_isolation: list[str] | None = None,
+ cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
@@ -561,12 +603,12 @@ def from_path(
def is_binary(
- fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes], # type: ignore[type-arg]
+ fp_or_path_or_payload: PathLike | str | BinaryIO | bytes, # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
- cp_isolation: Optional[List[str]] = None,
- cp_exclusion: Optional[List[str]] = None,
+ cp_isolation: list[str] | None = None,
+ cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
diff --git a/charset_normalizer/cd.py b/src/charset_normalizer/cd.py
similarity index 86%
rename from charset_normalizer/cd.py
rename to src/charset_normalizer/cd.py
index 4ea6760..71a3ed5 100644
--- a/charset_normalizer/cd.py
+++ b/src/charset_normalizer/cd.py
@@ -1,8 +1,10 @@
+from __future__ import annotations
+
import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
-from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
+from typing import Counter as TypeCounter
from .constant import (
FREQUENCIES,
@@ -22,26 +24,24 @@
)
-def encoding_unicode_range(iana_name: str) -> List[str]:
+def encoding_unicode_range(iana_name: str) -> list[str]:
"""
Return associated unicode ranges in a single byte code page.
"""
if is_multi_byte_encoding(iana_name):
- raise IOError("Function not supported on multi-byte code page")
+ raise OSError("Function not supported on multi-byte code page")
- decoder = importlib.import_module(
- "encodings.{}".format(iana_name)
- ).IncrementalDecoder
+ decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
p: IncrementalDecoder = decoder(errors="ignore")
- seen_ranges: Dict[str, int] = {}
+ seen_ranges: dict[str, int] = {}
character_count: int = 0
for i in range(0x40, 0xFF):
chunk: str = p.decode(bytes([i]))
if chunk:
- character_range: Optional[str] = unicode_range(chunk)
+ character_range: str | None = unicode_range(chunk)
if character_range is None:
continue
@@ -61,11 +61,11 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
)
-def unicode_range_languages(primary_range: str) -> List[str]:
+def unicode_range_languages(primary_range: str) -> list[str]:
"""
Return inferred languages used with a unicode range.
"""
- languages: List[str] = []
+ languages: list[str] = []
for language, characters in FREQUENCIES.items():
for character in characters:
@@ -77,13 +77,13 @@ def unicode_range_languages(primary_range: str) -> List[str]:
@lru_cache()
-def encoding_languages(iana_name: str) -> List[str]:
+def encoding_languages(iana_name: str) -> list[str]:
"""
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
"""
- unicode_ranges: List[str] = encoding_unicode_range(iana_name)
- primary_range: Optional[str] = None
+ unicode_ranges: list[str] = encoding_unicode_range(iana_name)
+ primary_range: str | None = None
for specified_range in unicode_ranges:
if "Latin" not in specified_range:
@@ -97,7 +97,7 @@ def encoding_languages(iana_name: str) -> List[str]:
@lru_cache()
-def mb_encoding_languages(iana_name: str) -> List[str]:
+def mb_encoding_languages(iana_name: str) -> list[str]:
"""
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
@@ -118,7 +118,7 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
-def get_target_features(language: str) -> Tuple[bool, bool]:
+def get_target_features(language: str) -> tuple[bool, bool]:
"""
Determine main aspects from a supported language if it contains accents and if is pure Latin.
"""
@@ -135,12 +135,12 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
def alphabet_languages(
- characters: List[str], ignore_non_latin: bool = False
-) -> List[str]:
+ characters: list[str], ignore_non_latin: bool = False
+) -> list[str]:
"""
Return associated languages associated to given characters.
"""
- languages: List[Tuple[str, float]] = []
+ languages: list[tuple[str, float]] = []
source_have_accents = any(is_accentuated(character) for character in characters)
@@ -170,7 +170,7 @@ def alphabet_languages(
def characters_popularity_compare(
- language: str, ordered_characters: List[str]
+ language: str, ordered_characters: list[str]
) -> float:
"""
Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
@@ -178,7 +178,7 @@ def characters_popularity_compare(
Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
"""
if language not in FREQUENCIES:
- raise ValueError("{} not available".format(language))
+ raise ValueError(f"{language} not available")
character_approved_count: int = 0
FREQUENCIES_language_set = set(FREQUENCIES[language])
@@ -214,14 +214,14 @@ def characters_popularity_compare(
character_approved_count += 1
continue
- characters_before_source: List[str] = FREQUENCIES[language][
+ characters_before_source: list[str] = FREQUENCIES[language][
0:character_rank_in_language
]
- characters_after_source: List[str] = FREQUENCIES[language][
+ characters_after_source: list[str] = FREQUENCIES[language][
character_rank_in_language:
]
- characters_before: List[str] = ordered_characters[0:character_rank]
- characters_after: List[str] = ordered_characters[character_rank:]
+ characters_before: list[str] = ordered_characters[0:character_rank]
+ characters_after: list[str] = ordered_characters[character_rank:]
before_match_count: int = len(
set(characters_before) & set(characters_before_source)
@@ -249,24 +249,24 @@ def characters_popularity_compare(
return character_approved_count / len(ordered_characters)
-def alpha_unicode_split(decoded_sequence: str) -> List[str]:
+def alpha_unicode_split(decoded_sequence: str) -> list[str]:
"""
Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
One containing the latin letters and the other hebrew.
"""
- layers: Dict[str, str] = {}
+ layers: dict[str, str] = {}
for character in decoded_sequence:
if character.isalpha() is False:
continue
- character_range: Optional[str] = unicode_range(character)
+ character_range: str | None = unicode_range(character)
if character_range is None:
continue
- layer_target_range: Optional[str] = None
+ layer_target_range: str | None = None
for discovered_range in layers:
if (
@@ -288,12 +288,12 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
return list(layers.values())
-def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
+def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
"""
This function merge results previously given by the function coherence_ratio.
The return type is the same as coherence_ratio.
"""
- per_language_ratios: Dict[str, List[float]] = {}
+ per_language_ratios: dict[str, list[float]] = {}
for result in results:
for sub_result in result:
language, ratio = sub_result
@@ -321,7 +321,7 @@ def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
We shall NOT return "English—" in CoherenceMatches because it is an alternative
of "English". This function only keeps the best match and remove the em-dash in it.
"""
- index_results: Dict[str, List[float]] = dict()
+ index_results: dict[str, list[float]] = dict()
for result in results:
language, ratio = result
@@ -345,14 +345,14 @@ def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
@lru_cache(maxsize=2048)
def coherence_ratio(
- decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
+ decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
"""
Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
A layer = Character extraction by alphabets/ranges.
"""
- results: List[Tuple[str, float]] = []
+ results: list[tuple[str, float]] = []
ignore_non_latin: bool = False
sufficient_match_count: int = 0
@@ -371,7 +371,7 @@ def coherence_ratio(
if character_count <= TOO_SMALL_SEQUENCE:
continue
- popular_character_ordered: List[str] = [c for c, o in most_common]
+ popular_character_ordered: list[str] = [c for c, o in most_common]
for language in lg_inclusion_list or alphabet_languages(
popular_character_ordered, ignore_non_latin
diff --git a/charset_normalizer/cli/__init__.py b/src/charset_normalizer/cli/__init__.py
similarity index 73%
rename from charset_normalizer/cli/__init__.py
rename to src/charset_normalizer/cli/__init__.py
index d95fedf..543a5a4 100644
--- a/charset_normalizer/cli/__init__.py
+++ b/src/charset_normalizer/cli/__init__.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
from .__main__ import cli_detect, query_yes_no
__all__ = (
diff --git a/charset_normalizer/cli/__main__.py b/src/charset_normalizer/cli/__main__.py
similarity index 71%
rename from charset_normalizer/cli/__main__.py
rename to src/charset_normalizer/cli/__main__.py
index f4bcbaa..cb64156 100644
--- a/charset_normalizer/cli/__main__.py
+++ b/src/charset_normalizer/cli/__main__.py
@@ -1,9 +1,11 @@
+from __future__ import annotations
+
import argparse
import sys
+import typing
from json import dumps
from os.path import abspath, basename, dirname, join, realpath
from platform import python_version
-from typing import List, Optional
from unicodedata import unidata_version
import charset_normalizer.md as md_module
@@ -42,10 +44,69 @@ def query_yes_no(question: str, default: str = "yes") -> bool:
elif choice in valid:
return valid[choice]
else:
- sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
+ sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
+
+
+class FileType:
+ """Factory for creating file object types
+ Instances of FileType are typically passed as type= arguments to the
+ ArgumentParser add_argument() method.
-def cli_detect(argv: Optional[List[str]] = None) -> int:
+ Keyword Arguments:
+ - mode -- A string indicating how the file is to be opened. Accepts the
+ same values as the builtin open() function.
+ - bufsize -- The file's desired buffer size. Accepts the same values as
+ the builtin open() function.
+ - encoding -- The file's encoding. Accepts the same values as the
+ builtin open() function.
+ - errors -- A string indicating how encoding and decoding errors are to
+ be handled. Accepts the same value as the builtin open() function.
+
+ Backported from CPython 3.12
+ """
+
+ def __init__(
+ self,
+ mode: str = "r",
+ bufsize: int = -1,
+ encoding: str | None = None,
+ errors: str | None = None,
+ ):
+ self._mode = mode
+ self._bufsize = bufsize
+ self._encoding = encoding
+ self._errors = errors
+
+ def __call__(self, string: str) -> typing.IO: # type: ignore[type-arg]
+ # the special argument "-" means sys.std{in,out}
+ if string == "-":
+ if "r" in self._mode:
+ return sys.stdin.buffer if "b" in self._mode else sys.stdin
+ elif any(c in self._mode for c in "wax"):
+ return sys.stdout.buffer if "b" in self._mode else sys.stdout
+ else:
+ msg = f'argument "-" with mode {self._mode}'
+ raise ValueError(msg)
+
+ # all other arguments are used as file names
+ try:
+ return open(string, self._mode, self._bufsize, self._encoding, self._errors)
+ except OSError as e:
+ message = f"can't open '{string}': {e}"
+ raise argparse.ArgumentTypeError(message)
+
+ def __repr__(self) -> str:
+ args = self._mode, self._bufsize
+ kwargs = [("encoding", self._encoding), ("errors", self._errors)]
+ args_str = ", ".join(
+ [repr(arg) for arg in args if arg != -1]
+ + [f"{kw}={arg!r}" for kw, arg in kwargs if arg is not None]
+ )
+ return f"{type(self).__name__}({args_str})"
+
+
+def cli_detect(argv: list[str] | None = None) -> int:
"""
CLI assistant using ARGV and ArgumentParser
:param argv:
@@ -58,7 +119,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
)
parser.add_argument(
- "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
+ "files", type=FileType("rb"), nargs="+", help="File(s) to be analysed"
)
parser.add_argument(
"-v",
@@ -109,6 +170,14 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
dest="force",
help="Replace file without asking if you are sure, use this flag with caution.",
)
+ parser.add_argument(
+ "-i",
+ "--no-preemptive",
+ action="store_true",
+ default=False,
+ dest="no_preemptive",
+ help="Disable looking at a charset declaration to hint the detector.",
+ )
parser.add_argument(
"-t",
"--threshold",
@@ -116,7 +185,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
default=0.2,
type=float,
dest="threshold",
- help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
+ help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
)
parser.add_argument(
"--version",
@@ -133,21 +202,35 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
args = parser.parse_args(argv)
if args.replace is True and args.normalize is False:
+ if args.files:
+ for my_file in args.files:
+ my_file.close()
print("Use --replace in addition of --normalize only.", file=sys.stderr)
return 1
if args.force is True and args.replace is False:
+ if args.files:
+ for my_file in args.files:
+ my_file.close()
print("Use --force in addition of --replace only.", file=sys.stderr)
return 1
if args.threshold < 0.0 or args.threshold > 1.0:
+ if args.files:
+ for my_file in args.files:
+ my_file.close()
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
return 1
x_ = []
for my_file in args.files:
- matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
+ matches = from_fp(
+ my_file,
+ threshold=args.threshold,
+ explain=args.verbose,
+ preemptive_behaviour=args.no_preemptive is False,
+ )
best_guess = matches.best()
@@ -155,9 +238,11 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
print(
'Unable to identify originating encoding for "{}". {}'.format(
my_file.name,
- "Maybe try increasing maximum amount of chaos."
- if args.threshold < 1.0
- else "",
+ (
+ "Maybe try increasing maximum amount of chaos."
+ if args.threshold < 1.0
+ else ""
+ ),
),
file=sys.stderr,
)
@@ -235,7 +320,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
dir_path = dirname(realpath(my_file.name))
file_name = basename(realpath(my_file.name))
- o_: List[str] = file_name.split(".")
+ o_: list[str] = file_name.split(".")
if args.replace is False:
o_.insert(-1, best_guess.encoding)
@@ -258,9 +343,9 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
try:
x_[0].unicode_path = join(dir_path, ".".join(o_))
- with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
- fp.write(str(best_guess))
- except IOError as e:
+ with open(x_[0].unicode_path, "wb") as fp:
+ fp.write(best_guess.output())
+ except OSError as e:
print(str(e), file=sys.stderr)
if my_file.closed is False:
my_file.close()
diff --git a/charset_normalizer/constant.py b/src/charset_normalizer/constant.py
similarity index 93%
rename from charset_normalizer/constant.py
rename to src/charset_normalizer/constant.py
index 8634904..cc71a01 100644
--- a/charset_normalizer/constant.py
+++ b/src/charset_normalizer/constant.py
@@ -1,11 +1,12 @@
-# -*- coding: utf-8 -*-
+from __future__ import annotations
+
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
from encodings.aliases import aliases
-from re import IGNORECASE, compile as re_compile
-from typing import Dict, List, Set, Union
+from re import IGNORECASE
+from re import compile as re_compile
# Contain for each eligible encoding a list of/item bytes SIG/BOM
-ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
+ENCODING_MARKS: dict[str, bytes | list[bytes]] = {
"utf_8": BOM_UTF8,
"utf_7": [
b"\x2b\x2f\x76\x38",
@@ -25,7 +26,7 @@
UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
# Up-to-date Unicode ucd/15.0.0
-UNICODE_RANGES_COMBINED: Dict[str, range] = {
+UNICODE_RANGES_COMBINED: dict[str, range] = {
"Control character": range(32),
"Basic Latin": range(32, 128),
"Latin-1 Supplement": range(128, 256),
@@ -357,7 +358,7 @@
}
-UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
+UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = [
"Supplement",
"Extended",
"Extensions",
@@ -392,7 +393,7 @@
"koi8_u",
]
-IANA_SUPPORTED: List[str] = sorted(
+IANA_SUPPORTED: list[str] = sorted(
filter(
lambda x: x.endswith("_codec") is False
and x not in {"rot_13", "tactis", "mbcs"},
@@ -403,7 +404,7 @@
IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
# pre-computed code page that are similar using the function cp_similarity.
-IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
+IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = {
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
"cp1125": ["cp866"],
@@ -492,7 +493,7 @@
}
-CHARDET_CORRESPONDENCE: Dict[str, str] = {
+CHARDET_CORRESPONDENCE: dict[str, str] = {
"iso2022_kr": "ISO-2022-KR",
"iso2022_jp": "ISO-2022-JP",
"euc_kr": "EUC-KR",
@@ -528,7 +529,7 @@
}
-COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
+COMMON_SAFE_ASCII_CHARACTERS: set[str] = {
"<",
">",
"=",
@@ -544,11 +545,30 @@
"|",
'"',
"-",
+ "(",
+ ")",
}
+# Sample character sets — replace with full lists if needed
+COMMON_CHINESE_CHARACTERS = "的一是在不了有和人这中大为上个国我以要他时来用们生到作地于出就分对成会可主发年动同工也能下过子说产种面而方后多定行学法所民得经十三之进着等部度家电力里如水化高自二理起小物现实加量都两体制机当使点从业本去把性好应开它合还因由其些然前外天政四日那社义事平形相全表间样与关各重新线内数正心反你明看原又么利比或但质气第向道命此变条只没结解问意建月公无系军很情者最立代想已通并提直题党程展五果料象员革位入常文总次品式活设及管特件长求老头基资边流路级少图山统接知较将组见计别她手角期根论运农指几九区强放决西被干做必战先回则任取据处队南给色光门即保治北造百规热领七海口东导器压志世金增争济阶油思术极交受联什认六共权收证改清己美再采转更单风切打白教速花带安场身车例真务具万每目至达走积示议声报斗完类八离华名确才科张信马节话米整空元况今集温传土许步群广石记需段研界拉林律叫且究观越织装影算低持音众书布复容儿须际商非验连断深难近矿千周委素技备半办青省列习响约支般史感劳便团往酸历市克何除消构府太准精值号率族维划选标写存候毛亲快效斯院查江型眼王按格养易置派层片始却专状育厂京识适属圆包火住调满县局照参红细引听该铁价严龙飞"
+
+COMMON_JAPANESE_CHARACTERS = "日一国年大十二本中長出三時行見月分後前生五間上東四今金九入学高円子外八六下来気小七山話女北午百書先名川千水半男西電校語土木聞食車何南万毎白天母火右読友左休父雨"
+
+COMMON_KOREAN_CHARACTERS = "一二三四五六七八九十百千萬上下左右中人女子大小山川日月火水木金土父母天地國名年時文校學生"
+
+# Combine all into a set
+COMMON_CJK_CHARACTERS = set(
+ "".join(
+ [
+ COMMON_CHINESE_CHARACTERS,
+ COMMON_JAPANESE_CHARACTERS,
+ COMMON_KOREAN_CHARACTERS,
+ ]
+ )
+)
-KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
-ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
+KO_NAMES: set[str] = {"johab", "cp949", "euc_kr"}
+ZH_NAMES: set[str] = {"big5", "cp950", "big5hkscs", "hz"}
# Logging LEVEL below DEBUG
TRACE: int = 5
@@ -556,7 +576,7 @@
# Language label that contain the em dash "—"
# character are to be considered alternative seq to origin
-FREQUENCIES: Dict[str, List[str]] = {
+FREQUENCIES: dict[str, list[str]] = {
"English": [
"e",
"a",
diff --git a/charset_normalizer/legacy.py b/src/charset_normalizer/legacy.py
similarity index 81%
rename from charset_normalizer/legacy.py
rename to src/charset_normalizer/legacy.py
index 43aad21..e221bec 100644
--- a/charset_normalizer/legacy.py
+++ b/src/charset_normalizer/legacy.py
@@ -1,13 +1,24 @@
-from typing import Any, Dict, Optional, Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
from warnings import warn
from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE
+# TODO: remove this check when dropping Python 3.7 support
+if TYPE_CHECKING:
+ from typing_extensions import TypedDict
+
+ class ResultDict(TypedDict):
+ encoding: str | None
+ language: str
+ confidence: float | None
+
def detect(
byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
-) -> Dict[str, Optional[Union[str, float]]]:
+) -> ResultDict:
"""
chardet legacy method
Detect the encoding of the given byte string. It should be mostly backward-compatible.
@@ -26,8 +37,7 @@ def detect(
if not isinstance(byte_str, (bytearray, bytes)):
raise TypeError( # pragma: nocover
- "Expected object of type bytes or bytearray, got: "
- "{0}".format(type(byte_str))
+ f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
if isinstance(byte_str, bytearray):
diff --git a/charset_normalizer/md.py b/src/charset_normalizer/md.py
similarity index 87%
rename from charset_normalizer/md.py
rename to src/charset_normalizer/md.py
index 77897aa..12ce024 100644
--- a/charset_normalizer/md.py
+++ b/src/charset_normalizer/md.py
@@ -1,6 +1,7 @@
+from __future__ import annotations
+
from functools import lru_cache
from logging import getLogger
-from typing import List, Optional
from .constant import (
COMMON_SAFE_ASCII_CHARACTERS,
@@ -25,6 +26,7 @@
is_unprintable,
remove_accent,
unicode_range,
+ is_cjk_uncommon,
)
@@ -68,7 +70,7 @@ def __init__(self) -> None:
self._symbol_count: int = 0
self._character_count: int = 0
- self._last_printable_char: Optional[str] = None
+ self._last_printable_char: str | None = None
self._frenzy_symbol_in_word: bool = False
def eligible(self, character: str) -> bool:
@@ -92,7 +94,7 @@ def feed(self, character: str) -> None:
self._last_printable_char = character
- def reset(self) -> None: # pragma: no cover
+ def reset(self) -> None: # Abstract
self._punctuation_count = 0
self._character_count = 0
self._symbol_count = 0
@@ -123,7 +125,7 @@ def feed(self, character: str) -> None:
if is_accentuated(character):
self._accentuated_count += 1
- def reset(self) -> None: # pragma: no cover
+ def reset(self) -> None: # Abstract
self._character_count = 0
self._accentuated_count = 0
@@ -149,7 +151,7 @@ def feed(self, character: str) -> None:
self._unprintable_count += 1
self._character_count += 1
- def reset(self) -> None: # pragma: no cover
+ def reset(self) -> None: # Abstract
self._unprintable_count = 0
@property
@@ -165,7 +167,7 @@ def __init__(self) -> None:
self._successive_count: int = 0
self._character_count: int = 0
- self._last_latin_character: Optional[str] = None
+ self._last_latin_character: str | None = None
def eligible(self, character: str) -> bool:
return character.isalpha() and is_latin(character)
@@ -184,7 +186,7 @@ def feed(self, character: str) -> None:
self._successive_count += 1
self._last_latin_character = character
- def reset(self) -> None: # pragma: no cover
+ def reset(self) -> None: # Abstract
self._successive_count = 0
self._character_count = 0
self._last_latin_character = None
@@ -201,7 +203,7 @@ class SuspiciousRange(MessDetectorPlugin):
def __init__(self) -> None:
self._suspicious_successive_range_count: int = 0
self._character_count: int = 0
- self._last_printable_seen: Optional[str] = None
+ self._last_printable_seen: str | None = None
def eligible(self, character: str) -> bool:
return character.isprintable()
@@ -221,22 +223,22 @@ def feed(self, character: str) -> None:
self._last_printable_seen = character
return
- unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
- unicode_range_b: Optional[str] = unicode_range(character)
+ unicode_range_a: str | None = unicode_range(self._last_printable_seen)
+ unicode_range_b: str | None = unicode_range(character)
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
self._suspicious_successive_range_count += 1
self._last_printable_seen = character
- def reset(self) -> None: # pragma: no cover
+ def reset(self) -> None: # Abstract
self._character_count = 0
self._suspicious_successive_range_count = 0
self._last_printable_seen = None
@property
def ratio(self) -> float:
- if self._character_count <= 24:
+ if self._character_count <= 13:
return 0.0
ratio_of_suspicious_range_usage: float = (
@@ -260,6 +262,7 @@ def __init__(self) -> None:
self._buffer: str = ""
self._buffer_accent_count: int = 0
+ self._buffer_glyph_count: int = 0
def eligible(self, character: str) -> bool:
return True
@@ -279,6 +282,14 @@ def feed(self, character: str) -> None:
and is_thai(character) is False
):
self._foreign_long_watch = True
+ if (
+ is_cjk(character)
+ or is_hangul(character)
+ or is_katakana(character)
+ or is_hiragana(character)
+ or is_thai(character)
+ ):
+ self._buffer_glyph_count += 1
return
if not self._buffer:
return
@@ -291,17 +302,20 @@ def feed(self, character: str) -> None:
self._character_count += buffer_length
if buffer_length >= 4:
- if self._buffer_accent_count / buffer_length > 0.34:
+ if self._buffer_accent_count / buffer_length >= 0.5:
self._is_current_word_bad = True
# Word/Buffer ending with an upper case accentuated letter are so rare,
# that we will consider them all as suspicious. Same weight as foreign_long suspicious.
- if (
+ elif (
is_accentuated(self._buffer[-1])
and self._buffer[-1].isupper()
and all(_.isupper() for _ in self._buffer) is False
):
self._foreign_long_count += 1
self._is_current_word_bad = True
+ elif self._buffer_glyph_count == 1:
+ self._is_current_word_bad = True
+ self._foreign_long_count += 1
if buffer_length >= 24 and self._foreign_long_watch:
camel_case_dst = [
i
@@ -325,6 +339,7 @@ def feed(self, character: str) -> None:
self._foreign_long_watch = False
self._buffer = ""
self._buffer_accent_count = 0
+ self._buffer_glyph_count = 0
elif (
character not in {"<", ">", "-", "=", "~", "|", "_"}
and character.isdigit() is False
@@ -333,7 +348,7 @@ def feed(self, character: str) -> None:
self._is_current_word_bad = True
self._buffer += character
- def reset(self) -> None: # pragma: no cover
+ def reset(self) -> None: # Abstract
self._buffer = ""
self._is_current_word_bad = False
self._foreign_long_watch = False
@@ -351,35 +366,39 @@ def ratio(self) -> float:
return self._bad_character_count / self._character_count
-class CjkInvalidStopPlugin(MessDetectorPlugin):
+class CjkUncommonPlugin(MessDetectorPlugin):
"""
- GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
- can be easily detected. Searching for the overuse of '丅' and '丄'.
+ Detect messy CJK text that probably means nothing.
"""
def __init__(self) -> None:
- self._wrong_stop_count: int = 0
- self._cjk_character_count: int = 0
+ self._character_count: int = 0
+ self._uncommon_count: int = 0
def eligible(self, character: str) -> bool:
- return True
+ return is_cjk(character)
def feed(self, character: str) -> None:
- if character in {"丅", "丄"}:
- self._wrong_stop_count += 1
+ self._character_count += 1
+
+ if is_cjk_uncommon(character):
+ self._uncommon_count += 1
return
- if is_cjk(character):
- self._cjk_character_count += 1
- def reset(self) -> None: # pragma: no cover
- self._wrong_stop_count = 0
- self._cjk_character_count = 0
+ def reset(self) -> None: # Abstract
+ self._character_count = 0
+ self._uncommon_count = 0
@property
def ratio(self) -> float:
- if self._cjk_character_count < 16:
+ if self._character_count < 8:
return 0.0
- return self._wrong_stop_count / self._cjk_character_count
+
+ uncommon_form_usage: float = self._uncommon_count / self._character_count
+
+ # we can be pretty sure it's garbage when uncommon characters are widely
+ # used. otherwise it could just be traditional chinese for example.
+ return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
@@ -393,7 +412,7 @@ def __init__(self) -> None:
self._character_count: int = 0
- self._last_alpha_seen: Optional[str] = None
+ self._last_alpha_seen: str | None = None
self._current_ascii_only: bool = True
def eligible(self, character: str) -> bool:
@@ -441,7 +460,7 @@ def feed(self, character: str) -> None:
self._character_count_since_last_sep += 1
self._last_alpha_seen = character
- def reset(self) -> None: # pragma: no cover
+ def reset(self) -> None: # Abstract
self._character_count = 0
self._character_count_since_last_sep = 0
self._successive_upper_lower_count = 0
@@ -463,7 +482,7 @@ def __init__(self) -> None:
self._character_count: int = 0
self._isolated_form_count: int = 0
- def reset(self) -> None: # pragma: no cover
+ def reset(self) -> None: # Abstract
self._character_count = 0
self._isolated_form_count = 0
@@ -488,7 +507,7 @@ def ratio(self) -> float:
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
- unicode_range_a: Optional[str], unicode_range_b: Optional[str]
+ unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
"""
Determine if two Unicode range seen next to each other can be considered as suspicious.
@@ -512,9 +531,10 @@ def is_suspiciously_successive_range(
):
return False
- keywords_range_a, keywords_range_b = unicode_range_a.split(
- " "
- ), unicode_range_b.split(" ")
+ keywords_range_a, keywords_range_b = (
+ unicode_range_a.split(" "),
+ unicode_range_b.split(" "),
+ )
for el in keywords_range_a:
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
@@ -567,7 +587,7 @@ def mess_ratio(
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
"""
- detectors: List[MessDetectorPlugin] = [
+ detectors: list[MessDetectorPlugin] = [
md_class() for md_class in MessDetectorPlugin.__subclasses__()
]
@@ -609,7 +629,7 @@ def mess_ratio(
logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
- for dt in detectors: # pragma: nocover
+ for dt in detectors:
logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
return round(mean_mess_ratio, 3)
diff --git a/charset_normalizer/models.py b/src/charset_normalizer/models.py
similarity index 77%
rename from charset_normalizer/models.py
rename to src/charset_normalizer/models.py
index a760b9c..1042758 100644
--- a/charset_normalizer/models.py
+++ b/src/charset_normalizer/models.py
@@ -1,9 +1,12 @@
+from __future__ import annotations
+
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+from re import sub
+from typing import Any, Iterator, List, Tuple
-from .constant import TOO_BIG_SEQUENCE
+from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range
@@ -14,8 +17,9 @@ def __init__(
guessed_encoding: str,
mean_mess_ratio: float,
has_sig_or_bom: bool,
- languages: "CoherenceMatches",
- decoded_payload: Optional[str] = None,
+ languages: CoherenceMatches,
+ decoded_payload: str | None = None,
+ preemptive_declaration: str | None = None,
):
self._payload: bytes = payload
@@ -23,23 +27,23 @@ def __init__(
self._mean_mess_ratio: float = mean_mess_ratio
self._languages: CoherenceMatches = languages
self._has_sig_or_bom: bool = has_sig_or_bom
- self._unicode_ranges: Optional[List[str]] = None
+ self._unicode_ranges: list[str] | None = None
- self._leaves: List[CharsetMatch] = []
+ self._leaves: list[CharsetMatch] = []
self._mean_coherence_ratio: float = 0.0
- self._output_payload: Optional[bytes] = None
- self._output_encoding: Optional[str] = None
+ self._output_payload: bytes | None = None
+ self._output_encoding: str | None = None
+
+ self._string: str | None = decoded_payload
- self._string: Optional[str] = decoded_payload
+ self._preemptive_declaration: str | None = preemptive_declaration
def __eq__(self, other: object) -> bool:
if not isinstance(other, CharsetMatch):
- raise TypeError(
- "__eq__ cannot be invoked on {} and {}.".format(
- str(other.__class__), str(self.__class__)
- )
- )
+ if isinstance(other, str):
+ return iana_name(other) == self.encoding
+ return False
return self.encoding == other.encoding and self.fingerprint == other.fingerprint
def __lt__(self, other: object) -> bool:
@@ -75,9 +79,9 @@ def __str__(self) -> str:
return self._string
def __repr__(self) -> str:
- return "".format(self.encoding, self.fingerprint)
+ return f""
- def add_submatch(self, other: "CharsetMatch") -> None:
+ def add_submatch(self, other: CharsetMatch) -> None:
if not isinstance(other, CharsetMatch) or other == self:
raise ValueError(
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
@@ -93,11 +97,11 @@ def encoding(self) -> str:
return self._encoding
@property
- def encoding_aliases(self) -> List[str]:
+ def encoding_aliases(self) -> list[str]:
"""
Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
"""
- also_known_as: List[str] = []
+ also_known_as: list[str] = []
for u, p in aliases.items():
if self.encoding == u:
also_known_as.append(p)
@@ -114,7 +118,7 @@ def byte_order_mark(self) -> bool:
return self._has_sig_or_bom
@property
- def languages(self) -> List[str]:
+ def languages(self) -> list[str]:
"""
Return the complete list of possible languages found in decoded sequence.
Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
@@ -175,7 +179,7 @@ def raw(self) -> bytes:
return self._payload
@property
- def submatch(self) -> List["CharsetMatch"]:
+ def submatch(self) -> list[CharsetMatch]:
return self._leaves
@property
@@ -183,19 +187,17 @@ def has_submatch(self) -> bool:
return len(self._leaves) > 0
@property
- def alphabets(self) -> List[str]:
+ def alphabets(self) -> list[str]:
if self._unicode_ranges is not None:
return self._unicode_ranges
# list detected ranges
- detected_ranges: List[Optional[str]] = [
- unicode_range(char) for char in str(self)
- ]
+ detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
# filter and sort
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
return self._unicode_ranges
@property
- def could_be_from_charset(self) -> List[str]:
+ def could_be_from_charset(self) -> list[str]:
"""
The complete list of encoding that output the exact SAME str result and therefore could be the originating
encoding.
@@ -210,7 +212,25 @@ def output(self, encoding: str = "utf_8") -> bytes:
"""
if self._output_encoding is None or self._output_encoding != encoding:
self._output_encoding = encoding
- self._output_payload = str(self).encode(encoding, "replace")
+ decoded_string = str(self)
+ if (
+ self._preemptive_declaration is not None
+ and self._preemptive_declaration.lower()
+ not in ["utf-8", "utf8", "utf_8"]
+ ):
+ patched_header = sub(
+ RE_POSSIBLE_ENCODING_INDICATION,
+ lambda m: m.string[m.span()[0] : m.span()[1]].replace(
+ m.groups()[0],
+ iana_name(self._output_encoding).replace("_", "-"), # type: ignore[arg-type]
+ ),
+ decoded_string[:8192],
+ count=1,
+ )
+
+ decoded_string = patched_header + decoded_string[8192:]
+
+ self._output_payload = decoded_string.encode(encoding, "replace")
return self._output_payload # type: ignore
@@ -228,13 +248,13 @@ class CharsetMatches:
Act like a list(iterable) but does not implements all related methods.
"""
- def __init__(self, results: Optional[List[CharsetMatch]] = None):
- self._results: List[CharsetMatch] = sorted(results) if results else []
+ def __init__(self, results: list[CharsetMatch] | None = None):
+ self._results: list[CharsetMatch] = sorted(results) if results else []
def __iter__(self) -> Iterator[CharsetMatch]:
yield from self._results
- def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
+ def __getitem__(self, item: int | str) -> CharsetMatch:
"""
Retrieve a single item either by its position or encoding name (alias may be used here).
Raise KeyError upon invalid index or encoding not present in results.
@@ -266,7 +286,7 @@ def append(self, item: CharsetMatch) -> None:
)
)
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
- if len(item.raw) <= TOO_BIG_SEQUENCE:
+ if len(item.raw) < TOO_BIG_SEQUENCE:
for match in self._results:
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
match.add_submatch(item)
@@ -274,7 +294,7 @@ def append(self, item: CharsetMatch) -> None:
self._results.append(item)
self._results = sorted(self._results)
- def best(self) -> Optional["CharsetMatch"]:
+ def best(self) -> CharsetMatch | None:
"""
Simply return the first match. Strict equivalent to matches[0].
"""
@@ -282,7 +302,7 @@ def best(self) -> Optional["CharsetMatch"]:
return None
return self._results[0]
- def first(self) -> Optional["CharsetMatch"]:
+ def first(self) -> CharsetMatch | None:
"""
Redundant method, call the method best(). Kept for BC reasons.
"""
@@ -297,31 +317,31 @@ class CliDetectionResult:
def __init__(
self,
path: str,
- encoding: Optional[str],
- encoding_aliases: List[str],
- alternative_encodings: List[str],
+ encoding: str | None,
+ encoding_aliases: list[str],
+ alternative_encodings: list[str],
language: str,
- alphabets: List[str],
+ alphabets: list[str],
has_sig_or_bom: bool,
chaos: float,
coherence: float,
- unicode_path: Optional[str],
+ unicode_path: str | None,
is_preferred: bool,
):
self.path: str = path
- self.unicode_path: Optional[str] = unicode_path
- self.encoding: Optional[str] = encoding
- self.encoding_aliases: List[str] = encoding_aliases
- self.alternative_encodings: List[str] = alternative_encodings
+ self.unicode_path: str | None = unicode_path
+ self.encoding: str | None = encoding
+ self.encoding_aliases: list[str] = encoding_aliases
+ self.alternative_encodings: list[str] = alternative_encodings
self.language: str = language
- self.alphabets: List[str] = alphabets
+ self.alphabets: list[str] = alphabets
self.has_sig_or_bom: bool = has_sig_or_bom
self.chaos: float = chaos
self.coherence: float = coherence
self.is_preferred: bool = is_preferred
@property
- def __dict__(self) -> Dict[str, Any]: # type: ignore
+ def __dict__(self) -> dict[str, Any]: # type: ignore
return {
"path": self.path,
"encoding": self.encoding,
diff --git a/charset_normalizer/py.typed b/src/charset_normalizer/py.typed
similarity index 100%
rename from charset_normalizer/py.typed
rename to src/charset_normalizer/py.typed
diff --git a/charset_normalizer/utils.py b/src/charset_normalizer/utils.py
similarity index 84%
rename from charset_normalizer/utils.py
rename to src/charset_normalizer/utils.py
index e5cbbf4..6bf0384 100644
--- a/charset_normalizer/utils.py
+++ b/src/charset_normalizer/utils.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
import importlib
import logging
import unicodedata
@@ -5,9 +7,11 @@
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
-from typing import Generator, List, Optional, Set, Tuple, Union
+from typing import Generator
-from _multibytecodec import MultibyteIncrementalDecoder
+from _multibytecodec import ( # type: ignore[import-not-found,import]
+ MultibyteIncrementalDecoder,
+)
from .constant import (
ENCODING_MARKS,
@@ -16,6 +20,7 @@
UNICODE_RANGES_COMBINED,
UNICODE_SECONDARY_RANGE_KEYWORD,
UTF8_MAXIMAL_ALLOCATION,
+ COMMON_CJK_CHARACTERS,
)
@@ -23,7 +28,7 @@
def is_accentuated(character: str) -> bool:
try:
description: str = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return (
"WITH GRAVE" in description
@@ -43,13 +48,13 @@ def remove_accent(character: str) -> str:
if not decomposed:
return character
- codes: List[str] = decomposed.split(" ")
+ codes: list[str] = decomposed.split(" ")
return chr(int(codes[0], 16))
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def unicode_range(character: str) -> Optional[str]:
+def unicode_range(character: str) -> str | None:
"""
Retrieve the Unicode range official name from a single character.
"""
@@ -66,7 +71,7 @@ def unicode_range(character: str) -> Optional[str]:
def is_latin(character: str) -> bool:
try:
description: str = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return "LATIN" in description
@@ -78,7 +83,7 @@ def is_punctuation(character: str) -> bool:
if "P" in character_category:
return True
- character_range: Optional[str] = unicode_range(character)
+ character_range: str | None = unicode_range(character)
if character_range is None:
return False
@@ -93,7 +98,7 @@ def is_symbol(character: str) -> bool:
if "S" in character_category or "N" in character_category:
return True
- character_range: Optional[str] = unicode_range(character)
+ character_range: str | None = unicode_range(character)
if character_range is None:
return False
@@ -103,7 +108,7 @@ def is_symbol(character: str) -> bool:
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
- character_range: Optional[str] = unicode_range(character)
+ character_range: str | None = unicode_range(character)
if character_range is None:
return False
@@ -130,7 +135,7 @@ def is_case_variable(character: str) -> bool:
def is_cjk(character: str) -> bool:
try:
character_name = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return "CJK" in character_name
@@ -140,7 +145,7 @@ def is_cjk(character: str) -> bool:
def is_hiragana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return "HIRAGANA" in character_name
@@ -150,7 +155,7 @@ def is_hiragana(character: str) -> bool:
def is_katakana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return "KATAKANA" in character_name
@@ -160,7 +165,7 @@ def is_katakana(character: str) -> bool:
def is_hangul(character: str) -> bool:
try:
character_name = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return "HANGUL" in character_name
@@ -170,7 +175,7 @@ def is_hangul(character: str) -> bool:
def is_thai(character: str) -> bool:
try:
character_name = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return "THAI" in character_name
@@ -180,7 +185,7 @@ def is_thai(character: str) -> bool:
def is_arabic(character: str) -> bool:
try:
character_name = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return "ARABIC" in character_name
@@ -190,12 +195,17 @@ def is_arabic(character: str) -> bool:
def is_arabic_isolated_form(character: str) -> bool:
try:
character_name = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return "ARABIC" in character_name and "ISOLATED FORM" in character_name
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_cjk_uncommon(character: str) -> bool:
+ return character not in COMMON_CJK_CHARACTERS
+
+
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
@@ -206,13 +216,13 @@ def is_unprintable(character: str) -> bool:
return (
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
- and character != "\x1A" # Why? Its the ASCII substitute character.
+ and character != "\x1a" # Why? Its the ASCII substitute character.
and character != "\ufeff" # bug discovered in Python,
# Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
)
-def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
+def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
"""
Extract using ASCII-only decoder any specified encoding in the first n-bytes.
"""
@@ -221,7 +231,7 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional
seq_len: int = len(sequence)
- results: List[str] = findall(
+ results: list[str] = findall(
RE_POSSIBLE_ENCODING_INDICATION,
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
)
@@ -260,18 +270,18 @@ def is_multi_byte_encoding(name: str) -> bool:
"utf_32_be",
"utf_7",
} or issubclass(
- importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
+ importlib.import_module(f"encodings.{name}").IncrementalDecoder,
MultibyteIncrementalDecoder,
)
-def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
+def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
"""
Identify and extract SIG/BOM in given sequence.
"""
for iana_encoding in ENCODING_MARKS:
- marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]
+ marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]
if isinstance(marks, bytes):
marks = [marks]
@@ -288,6 +298,7 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
def iana_name(cp_name: str, strict: bool = True) -> str:
+ """Returns the Python normalized encoding name (Not the IANA official name)."""
cp_name = cp_name.lower().replace("-", "_")
encoding_alias: str
@@ -298,35 +309,17 @@ def iana_name(cp_name: str, strict: bool = True) -> str:
return encoding_iana
if strict:
- raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))
+ raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
return cp_name
-def range_scan(decoded_sequence: str) -> List[str]:
- ranges: Set[str] = set()
-
- for character in decoded_sequence:
- character_range: Optional[str] = unicode_range(character)
-
- if character_range is None:
- continue
-
- ranges.add(character_range)
-
- return list(ranges)
-
-
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
return 0.0
- decoder_a = importlib.import_module(
- "encodings.{}".format(iana_name_a)
- ).IncrementalDecoder
- decoder_b = importlib.import_module(
- "encodings.{}".format(iana_name_b)
- ).IncrementalDecoder
+ decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
+ decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder
id_a: IncrementalDecoder = decoder_a(errors="ignore")
id_b: IncrementalDecoder = decoder_b(errors="ignore")
@@ -374,7 +367,7 @@ def cut_sequence_chunks(
strip_sig_or_bom: bool,
sig_payload: bytes,
is_multi_byte_decoder: bool,
- decoded_payload: Optional[str] = None,
+ decoded_payload: str | None = None,
) -> Generator[str, None, None]:
if decoded_payload and is_multi_byte_decoder is False:
for i in offsets:
diff --git a/src/charset_normalizer/version.py b/src/charset_normalizer/version.py
new file mode 100644
index 0000000..e5687e3
--- /dev/null
+++ b/src/charset_normalizer/version.py
@@ -0,0 +1,8 @@
+"""
+Expose version
+"""
+
+from __future__ import annotations
+
+__version__ = "3.4.2"
+VERSION = __version__.split(".")
diff --git a/tests/__init__.py b/tests/__init__.py
index 8b13789..e69de29 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1 +0,0 @@
-
diff --git a/tests/test_base_detection.py b/tests/test_base_detection.py
index 3180a50..e4fb5fd 100644
--- a/tests/test_base_detection.py
+++ b/tests/test_base_detection.py
@@ -1,40 +1,52 @@
-from charset_normalizer.api import from_bytes
-from charset_normalizer.models import CharsetMatches
+from __future__ import annotations
import pytest
+from charset_normalizer.api import from_bytes
+from charset_normalizer.models import CharsetMatches
+
def test_empty():
- best_guess = from_bytes(b'').best()
+ best_guess = from_bytes(b"").best()
assert best_guess is not None, "Empty bytes payload SHOULD NOT return None"
- assert best_guess.encoding == "utf_8", "Empty bytes payload SHOULD be guessed as UTF-8 (arbitrary)"
+ assert (
+ best_guess.encoding == "utf_8"
+ ), "Empty bytes payload SHOULD be guessed as UTF-8 (arbitrary)"
assert len(best_guess.alphabets) == 0, ""
def test_bool_matches():
- guesses_not_empty = from_bytes(b'')
+ guesses_not_empty = from_bytes(b"")
guesses_empty = CharsetMatches([])
- assert bool(guesses_not_empty) is True, "Bool behaviour of CharsetMatches altered, should be True"
- assert bool(guesses_empty) is False, "Bool behaviour of CharsetMatches altered, should be False"
+ assert (
+ bool(guesses_not_empty) is True
+ ), "Bool behaviour of CharsetMatches altered, should be True"
+ assert (
+ bool(guesses_empty) is False
+ ), "Bool behaviour of CharsetMatches altered, should be False"
@pytest.mark.parametrize(
"payload, expected_encoding",
[
- (b'\xfe\xff', 'utf_16'),
- ('\uFEFF'.encode('gb18030'), 'gb18030'),
- (b'\xef\xbb\xbf', 'utf_8'),
- ("".encode('utf_32'), "utf_32")
- ]
+ (b"\xfe\xff", "utf_16"),
+ ("\uFEFF".encode("gb18030"), "gb18030"),
+ (b"\xef\xbb\xbf", "utf_8"),
+ ("".encode("utf_32"), "utf_32"),
+ ],
)
def test_empty_but_with_bom_or_sig(payload, expected_encoding):
best_guess = from_bytes(payload).best()
assert best_guess is not None, "Empty detection but with SIG/BOM has failed!"
- assert best_guess.encoding == expected_encoding, "Empty detection but with SIG/BOM is wrongly detected!"
- assert best_guess.raw == payload, "The RAW property should contain the original payload given for detection."
+ assert (
+ best_guess.encoding == expected_encoding
+ ), "Empty detection but with SIG/BOM is wrongly detected!"
+ assert (
+ best_guess.raw == payload
+ ), "The RAW property should contain the original payload given for detection."
assert best_guess.byte_order_mark is True, "The BOM/SIG property should return True"
assert str(best_guess) == "", "The cast to str SHOULD be empty"
@@ -42,16 +54,27 @@ def test_empty_but_with_bom_or_sig(payload, expected_encoding):
@pytest.mark.parametrize(
"payload, expected_encoding",
[
- ((u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030'), "gb18030",),
- ('我没有埋怨,磋砣的只是一些时间。'.encode('utf_32'), "utf_32",),
- ('我没有埋怨,磋砣的只是一些时间。'.encode('utf_8_sig'), "utf_8",),
- ]
+ (
+ ("\uFEFF" + "我没有埋怨,磋砣的只是一些时间。").encode("gb18030"),
+ "gb18030",
+ ),
+ (
+ "我没有埋怨,磋砣的只是一些时间。".encode("utf_32"),
+ "utf_32",
+ ),
+ (
+ "我没有埋怨,磋砣的只是一些时间。".encode("utf_8_sig"),
+ "utf_8",
+ ),
+ ],
)
def test_content_with_bom_or_sig(payload, expected_encoding):
best_guess = from_bytes(payload).best()
assert best_guess is not None, "Detection but with SIG/BOM has failed!"
- assert best_guess.encoding == expected_encoding, "Detection but with SIG/BOM is wrongly detected!"
+ assert (
+ best_guess.encoding == expected_encoding
+ ), "Detection but with SIG/BOM is wrongly detected!"
assert best_guess.byte_order_mark is True, "The BOM/SIG property should return True"
@@ -63,42 +86,49 @@ def test_content_with_bom_or_sig(payload, expected_encoding):
b'{"token": "g4UsPJdfzNkGW2jwmKDGDilKGKYtpF2X.mx3MaTWL1tL7CNn5U7DeCcodKX7S3lwwJPKNjBT8etY"}',
b"81f4ab054b39cb0e12701e734077d84264308f5fc79494fc5f159fa2ebc07b73c8cc0e98e009664a20986706f90146e8eefcb929ce1f74a8eab21369fdc70198",
b"{}",
- ]
+ ],
)
def test_obviously_ascii_content(payload):
best_guess = from_bytes(payload).best()
assert best_guess is not None, "Dead-simple ASCII detection has failed!"
- assert best_guess.encoding == "ascii", "Dead-simple ASCII detection is wrongly detected!"
+ assert (
+ best_guess.encoding == "ascii"
+ ), "Dead-simple ASCII detection is wrongly detected!"
@pytest.mark.parametrize(
"payload",
[
- '\u020d\x1b'.encode('utf-8'),
- 'h\xe9llo world!\n'.encode('utf_8'),
- '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8'),
- 'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, поне що се отнася до началното и основното образование.'.encode('utf_8'),
- 'Bсеки човек има право на образование.'.encode('utf_8'),
- "(° ͜ʖ °), creepy face, smiley 😀".encode("utf_8"),
- """["Financiën", "La France"]""".encode("utf_8"),
- "Qu'est ce que une étoile?".encode("utf_8"),
- """Financiën""".encode("utf_8"),
- "😀".encode("utf_8")
- ]
+ "\u020d\x1b".encode(),
+ "h\xe9llo world!\n".encode(),
+ "我没有埋怨,磋砣的只是一些时间。".encode(),
+ "Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, поне що се отнася до началното и основното образование.".encode(),
+ "Bсеки човек има право на образование.".encode(),
+ "(° ͜ʖ °), creepy face, smiley 😀".encode(),
+ """["Financiën", "La France"]""".encode(),
+ "Qu'est ce que une étoile?".encode(),
+ """Financiën""".encode(),
+ "😀".encode(),
+ ],
)
def test_obviously_utf8_content(payload):
best_guess = from_bytes(payload).best()
assert best_guess is not None, "Dead-simple UTF-8 detection has failed!"
- assert best_guess.encoding == "utf_8", "Dead-simple UTF-8 detection is wrongly detected!"
+ assert (
+ best_guess.encoding == "utf_8"
+ ), "Dead-simple UTF-8 detection is wrongly detected!"
def test_mb_cutting_chk():
# This payload should be wrongfully split and the autofix should ran automatically
# on chunks extraction.
- payload = b"\xbf\xaa\xbb\xe7\xc0\xfb \xbf\xb9\xbc\xf6 " \
- b" \xbf\xac\xb1\xb8\xc0\xda\xb5\xe9\xc0\xba \xba\xb9\xc0\xbd\xbc\xad\xb3\xaa " * 128
+ payload = (
+ b"\xbf\xaa\xbb\xe7\xc0\xfb \xbf\xb9\xbc\xf6 "
+ b" \xbf\xac\xb1\xb8\xc0\xda\xb5\xe9\xc0\xba \xba\xb9\xc0\xbd\xbc\xad\xb3\xaa "
+ * 128
+ )
guesses = from_bytes(payload, cp_isolation=["cp949"])
best_guess = guesses.best()
@@ -108,9 +138,7 @@ def test_mb_cutting_chk():
def test_alphabets_property():
- best_guess = from_bytes(
- "😀 Hello World! How affairs are going? 😀".encode("utf_8")
- ).best()
+ best_guess = from_bytes("😀 Hello World! How affairs are going? 😀".encode()).best()
assert "Basic Latin" in best_guess.alphabets
assert "Emoticons range(Emoji)" in best_guess.alphabets
@@ -119,7 +147,16 @@ def test_alphabets_property():
def test_doc_example_short_cp1251():
best_guess = from_bytes(
- 'Bсеки човек има право на образование.'.encode('cp1251')
+ "Bсеки човек има право на образование.".encode("cp1251")
).best()
assert best_guess.encoding == "cp1251"
+
+
+def test_direct_cmp_charset_match():
+ best_guess = from_bytes("😀 Hello World! How affairs are going? 😀".encode()).best()
+
+ assert best_guess == "utf_8"
+ assert best_guess == "utf-8"
+ assert best_guess != 8
+ assert best_guess != None
diff --git a/tests/test_cli.py b/tests/test_cli.py
index b73fb61..5f2777c 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,70 +1,46 @@
+from __future__ import annotations
+
import unittest
-from charset_normalizer.cli import cli_detect, query_yes_no
-from unittest.mock import patch
+from os import pardir, path, remove
from os.path import exists
-from os import remove, path, pardir
+from unittest.mock import patch
+
+from charset_normalizer.cli import cli_detect, query_yes_no
-DIR_PATH = path.join(
- path.dirname(path.realpath(__file__)),
- pardir
-)
+DIR_PATH = path.join(path.dirname(path.realpath(__file__)), pardir)
class TestCommandLineInterface(unittest.TestCase):
-
- @patch('builtins.input', lambda *args: 'y')
+ @patch("builtins.input", lambda *args: "y")
def test_simple_yes_input(self):
- self.assertTrue(
- query_yes_no('Are u willing to chill a little bit ?')
- )
+ self.assertTrue(query_yes_no("Are u willing to chill a little bit ?"))
- @patch('builtins.input', lambda *args: 'N')
+ @patch("builtins.input", lambda *args: "N")
def test_simple_no_input(self):
- self.assertFalse(
- query_yes_no('Are u willing to chill a little bit ?')
- )
+ self.assertFalse(query_yes_no("Are u willing to chill a little bit ?"))
def test_single_file(self):
-
- self.assertEqual(
- 0,
- cli_detect(
- [DIR_PATH + '/data/sample-arabic-1.txt']
- )
- )
+ self.assertEqual(0, cli_detect([DIR_PATH + "/data/sample-arabic-1.txt"]))
def test_version_output_success(self):
with self.assertRaises(SystemExit):
- cli_detect(
- ['--version']
- )
+ cli_detect(["--version"])
def test_single_file_normalize(self):
self.assertEqual(
- 0,
- cli_detect(
- [
- DIR_PATH + '/data/sample-arabic-1.txt',
- '--normalize'
- ]
- )
+ 0, cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--normalize"])
)
- self.assertTrue(
- exists(DIR_PATH + '/data/sample-arabic-1.cp1256.txt')
- )
+ self.assertTrue(exists(DIR_PATH + "/data/sample-arabic-1.cp1256.txt"))
try:
- remove(DIR_PATH + '/data/sample-arabic-1.cp1256.txt')
+ remove(DIR_PATH + "/data/sample-arabic-1.cp1256.txt")
except:
pass
def test_single_verbose_file(self):
self.assertEqual(
- 0,
- cli_detect(
- [DIR_PATH + '/data/sample-arabic-1.txt', '--verbose']
- )
+ 0, cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--verbose"])
)
def test_multiple_file(self):
@@ -72,11 +48,11 @@ def test_multiple_file(self):
0,
cli_detect(
[
- DIR_PATH + '/data/sample-arabic-1.txt',
- DIR_PATH + '/data/sample-french.txt',
- DIR_PATH + '/data/sample-chinese.txt'
+ DIR_PATH + "/data/sample-arabic-1.txt",
+ DIR_PATH + "/data/sample-french.txt",
+ DIR_PATH + "/data/sample-chinese.txt",
]
- )
+ ),
)
def test_with_alternative(self):
@@ -84,12 +60,12 @@ def test_with_alternative(self):
0,
cli_detect(
[
- '-a',
- DIR_PATH + '/data/sample-arabic-1.txt',
- DIR_PATH + '/data/sample-french.txt',
- DIR_PATH + '/data/sample-chinese.txt'
+ "-a",
+ DIR_PATH + "/data/sample-arabic-1.txt",
+ DIR_PATH + "/data/sample-french.txt",
+ DIR_PATH + "/data/sample-chinese.txt",
]
- )
+ ),
)
def test_with_minimal_output(self):
@@ -97,12 +73,12 @@ def test_with_minimal_output(self):
0,
cli_detect(
[
- '-m',
- DIR_PATH + '/data/sample-arabic-1.txt',
- DIR_PATH + '/data/sample-french.txt',
- DIR_PATH + '/data/sample-chinese.txt'
+ "-m",
+ DIR_PATH + "/data/sample-arabic-1.txt",
+ DIR_PATH + "/data/sample-french.txt",
+ DIR_PATH + "/data/sample-chinese.txt",
]
- )
+ ),
)
def test_with_minimal_and_alt(self):
@@ -110,47 +86,31 @@ def test_with_minimal_and_alt(self):
0,
cli_detect(
[
- '-m',
- '-a',
- DIR_PATH + '/data/sample-arabic-1.txt',
- DIR_PATH + '/data/sample-french.txt',
- DIR_PATH + '/data/sample-chinese.txt'
+ "-m",
+ "-a",
+ DIR_PATH + "/data/sample-arabic-1.txt",
+ DIR_PATH + "/data/sample-french.txt",
+ DIR_PATH + "/data/sample-chinese.txt",
]
- )
+ ),
)
def test_non_existent_file(self):
-
with self.assertRaises(SystemExit) as cm:
- cli_detect(
- [DIR_PATH + '/data/not_found_data.txt']
- )
+ cli_detect([DIR_PATH + "/data/not_found_data.txt"])
self.assertEqual(cm.exception.code, 2)
def test_replace_without_normalize(self):
-
self.assertEqual(
- cli_detect(
- [
- DIR_PATH + '/data/sample-arabic-1.txt',
- '--replace'
- ]
- ),
- 1
+ cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--replace"]), 1
)
def test_force_replace_without_replace(self):
self.assertEqual(
- cli_detect(
- [
- DIR_PATH + '/data/sample-arabic-1.txt',
- '--force'
- ]
- ),
- 1
+ cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--force"]), 1
)
-if __name__ == '__main__':
+if __name__ == "__main__":
unittest.main()
diff --git a/tests/test_coherence_detection.py b/tests/test_coherence_detection.py
index 7e39913..e5952d6 100644
--- a/tests/test_coherence_detection.py
+++ b/tests/test_coherence_detection.py
@@ -1,5 +1,14 @@
+from __future__ import annotations
+
import pytest
-from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding, get_target_features, filter_alt_coherence_matches
+
+from charset_normalizer.cd import (
+ encoding_languages,
+ filter_alt_coherence_matches,
+ get_target_features,
+ is_multi_byte_encoding,
+ mb_encoding_languages,
+)
@pytest.mark.parametrize(
@@ -13,14 +22,20 @@
("johab", ["Korean"]),
("shift_jis", ["Japanese"]),
("mac_greek", ["Greek"]),
- ("iso2022_jp", ["Japanese"])
- ]
+ ("iso2022_jp", ["Japanese"]),
+ ],
)
def test_infer_language_from_cp(iana_encoding, expected_languages):
- languages = mb_encoding_languages(iana_encoding) if is_multi_byte_encoding(iana_encoding) else encoding_languages(iana_encoding)
+ languages = (
+ mb_encoding_languages(iana_encoding)
+ if is_multi_byte_encoding(iana_encoding)
+ else encoding_languages(iana_encoding)
+ )
for expected_language in expected_languages:
- assert expected_language in languages, "Wrongly detected language for given code page"
+ assert (
+ expected_language in languages
+ ), "Wrongly detected language for given code page"
@pytest.mark.parametrize(
@@ -31,8 +46,8 @@ def test_infer_language_from_cp(iana_encoding, expected_languages):
("Hebrew", False, False),
("Arabic", False, False),
("Vietnamese", True, True),
- ("Turkish", True, True)
- ]
+ ("Turkish", True, True),
+ ],
)
def test_target_features(language, expected_have_accents, expected_pure_latin):
target_have_accents, target_pure_latin = get_target_features(language)
@@ -44,11 +59,48 @@ def test_target_features(language, expected_have_accents, expected_pure_latin):
@pytest.mark.parametrize(
"matches, expected_return",
[
- ([("English", 0.88,), ("English—", 0.99)], [("English", 0.99)]),
- ([("English", 0.88,), ("English—", 0.99), ("English——", 0.999)], [("English", 0.999)]),
- ([("English", 0.88,), ("English—", 0.77)], [("English", 0.88)]),
- ([("English", 0.88,), ("Italian", 0.77)], [("English", 0.88), ("Italian", 0.77)]),
- ]
+ (
+ [
+ (
+ "English",
+ 0.88,
+ ),
+ ("English—", 0.99),
+ ],
+ [("English", 0.99)],
+ ),
+ (
+ [
+ (
+ "English",
+ 0.88,
+ ),
+ ("English—", 0.99),
+ ("English——", 0.999),
+ ],
+ [("English", 0.999)],
+ ),
+ (
+ [
+ (
+ "English",
+ 0.88,
+ ),
+ ("English—", 0.77),
+ ],
+ [("English", 0.88)],
+ ),
+ (
+ [
+ (
+ "English",
+ 0.88,
+ ),
+ ("Italian", 0.77),
+ ],
+ [("English", 0.88), ("Italian", 0.77)],
+ ),
+ ],
)
def test_filter_alt_coherence_matches(matches, expected_return):
results = filter_alt_coherence_matches(matches)
diff --git a/tests/test_detect_legacy.py b/tests/test_detect_legacy.py
index ec45aa7..bd2b035 100644
--- a/tests/test_detect_legacy.py
+++ b/tests/test_detect_legacy.py
@@ -1,75 +1,43 @@
+from __future__ import annotations
+
import unittest
+
from charset_normalizer.legacy import detect
class TestDetectLegacy(unittest.TestCase):
-
def test_detect_dict_keys(self):
+ r = detect(("\uFEFF" + "我没有埋怨,磋砣的只是一些时间。").encode("gb18030"))
- r = detect(
- (u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030')
- )
+ with self.subTest("encoding key present"):
+ self.assertIn("encoding", r.keys())
- with self.subTest('encoding key present'):
- self.assertIn(
- 'encoding',
- r.keys()
- )
+ with self.subTest("language key present"):
+ self.assertIn("language", r.keys())
- with self.subTest('language key present'):
- self.assertIn(
- 'language',
- r.keys()
- )
-
- with self.subTest('confidence key present'):
- self.assertIn(
- 'confidence',
- r.keys()
- )
+ with self.subTest("confidence key present"):
+ self.assertIn("confidence", r.keys())
def test_detect_dict_value_type(self):
+ r = detect("我没有埋怨,磋砣的只是一些时间。".encode())
- r = detect(
- '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8')
- )
-
- with self.subTest('encoding instance of str'):
- self.assertIsInstance(
- r['encoding'],
- str
- )
+ with self.subTest("encoding instance of str"):
+ self.assertIsInstance(r["encoding"], str)
- with self.subTest('language instance of str'):
- self.assertIsInstance(
- r['language'],
- str
- )
+ with self.subTest("language instance of str"):
+ self.assertIsInstance(r["language"], str)
- with self.subTest('confidence instance of float'):
- self.assertIsInstance(
- r['confidence'],
- float
- )
+ with self.subTest("confidence instance of float"):
+ self.assertIsInstance(r["confidence"], float)
def test_detect_dict_value(self):
- r = detect(
- '我没有埋怨,磋砣的只是一些时间。'.encode('utf_32')
- )
+ r = detect("我没有埋怨,磋砣的只是一些时间。".encode("utf_32"))
- with self.subTest('encoding is equal to utf_32'):
- self.assertEqual(
- r['encoding'],
- 'UTF-32'
- )
+ with self.subTest("encoding is equal to utf_32"):
+ self.assertEqual(r["encoding"], "UTF-32")
def test_utf8_sig_not_striped(self):
- r = detect(
- "Hello World".encode('utf-8-sig')
- )
+ r = detect("Hello World".encode("utf-8-sig"))
with self.subTest("Verify that UTF-8-SIG is returned when using legacy detect"):
- self.assertEqual(
- r['encoding'],
- "UTF-8-SIG"
- )
+ self.assertEqual(r["encoding"], "UTF-8-SIG")
diff --git a/tests/test_edge_case.py b/tests/test_edge_case.py
index f324664..5b763ba 100644
--- a/tests/test_edge_case.py
+++ b/tests/test_edge_case.py
@@ -1,12 +1,59 @@
-from charset_normalizer import from_bytes
-import pytest
+from __future__ import annotations
+
import platform
-@pytest.mark.xfail(platform.python_version_tuple()[0] == "3" and platform.python_version_tuple()[1] == "7", reason="Unicode database is too old for this case (Python 3.7)")
+import pytest
+
+from charset_normalizer import from_bytes
+
+
+@pytest.mark.xfail(
+ platform.python_version_tuple()[0] == "3"
+ and platform.python_version_tuple()[1] == "7",
+ reason="Unicode database is too old for this case (Python 3.7)",
+)
def test_unicode_edge_case():
- payload = b'\xef\xbb\xbf\xf0\x9f\xa9\xb3'
+ payload = b"\xef\xbb\xbf\xf0\x9f\xa9\xb3"
best_guess = from_bytes(payload).best()
- assert best_guess is not None, "Payload should have given something, detection failure"
+ assert (
+ best_guess is not None
+ ), "Payload should have given something, detection failure"
assert best_guess.encoding == "utf_8", "UTF-8 payload wrongly detected"
+
+
+def test_issue_gh520():
+ """Verify that minorities does not strip basic latin characters!"""
+ payload = b"/includes/webform.compon\xd2\xaants.inc/"
+
+ best_guess = from_bytes(payload).best()
+
+ assert (
+ best_guess is not None
+ ), "Payload should have given something, detection failure"
+ assert "Basic Latin" in best_guess.alphabets
+
+
+def test_issue_gh509():
+ """Two common ASCII punctuations should render as-is."""
+ payload = b");"
+
+ best_guess = from_bytes(payload).best()
+
+ assert (
+ best_guess is not None
+ ), "Payload should have given something, detection failure"
+ assert "ascii" == best_guess.encoding
+
+
+def test_issue_gh498():
+ """This case was mistaken for utf-16-le, this should never happen again."""
+ payload = b"\x84\xae\xaa\xe3\xac\xa5\xad\xe2 Microsoft Word.docx"
+
+ best_guess = from_bytes(payload).best()
+
+ assert (
+ best_guess is not None
+ ), "Payload should have given something, detection failure"
+ assert "Cyrillic" in best_guess.alphabets
diff --git a/tests/test_full_detection.py b/tests/test_full_detection.py
index adff880..ff91e12 100644
--- a/tests/test_full_detection.py
+++ b/tests/test_full_detection.py
@@ -1,43 +1,50 @@
-from charset_normalizer.api import from_path
+from __future__ import annotations
+
+from os import pardir, path
+
import pytest
-from os import path, pardir
-DIR_PATH = path.join(
- path.dirname(path.realpath(__file__)),
- pardir
-)
+from charset_normalizer.api import from_path
+
+DIR_PATH = path.join(path.dirname(path.realpath(__file__)), pardir)
@pytest.mark.parametrize(
"input_data_file, expected_charset, expected_language",
[
- ('sample-arabic-1.txt', 'cp1256', 'Arabic'),
- ('sample-french-1.txt', 'cp1252', 'French'),
- ('sample-arabic.txt', 'utf_8', 'Arabic'),
- ('sample-russian-3.txt', 'utf_8', 'Russian'),
- ('sample-french.txt', 'utf_8', 'French'),
- ('sample-chinese.txt', 'big5', 'Chinese'),
- ('sample-greek.txt', 'cp1253', 'Greek'),
- ('sample-greek-2.txt', 'cp1253', 'Greek'),
- ('sample-hebrew-2.txt', 'cp1255', 'Hebrew'),
- ('sample-hebrew-3.txt', 'cp1255', 'Hebrew'),
- ('sample-bulgarian.txt', 'utf_8', 'Bulgarian'),
- ('sample-english.bom.txt', 'utf_8', 'English'),
- ('sample-spanish.txt', 'utf_8', 'Spanish'),
- ('sample-korean.txt', 'cp949', 'Korean'),
- ('sample-turkish.txt', 'cp1254', 'Turkish'),
- ('sample-russian-2.txt', 'utf_8', 'Russian'),
- ('sample-russian.txt', 'mac_cyrillic', 'Russian'),
- ('sample-polish.txt', 'utf_8', 'Polish'),
- ]
+ ("sample-arabic-1.txt", "cp1256", "Arabic"),
+ ("sample-french-1.txt", "cp1252", "French"),
+ ("sample-arabic.txt", "utf_8", "Arabic"),
+ ("sample-russian-3.txt", "utf_8", "Russian"),
+ ("sample-french.txt", "utf_8", "French"),
+ ("sample-chinese.txt", "big5", "Chinese"),
+ ("sample-greek.txt", "cp1253", "Greek"),
+ ("sample-greek-2.txt", "cp1253", "Greek"),
+ ("sample-hebrew-2.txt", "cp1255", "Hebrew"),
+ ("sample-hebrew-3.txt", "cp1255", "Hebrew"),
+ ("sample-bulgarian.txt", "utf_8", "Bulgarian"),
+ ("sample-english.bom.txt", "utf_8", "English"),
+ ("sample-spanish.txt", "utf_8", "Spanish"),
+ ("sample-korean.txt", "cp949", "Korean"),
+ ("sample-turkish.txt", "cp1254", "Turkish"),
+ ("sample-russian-2.txt", "utf_8", "Russian"),
+ ("sample-russian.txt", "mac_cyrillic", "Russian"),
+ ("sample-polish.txt", "utf_8", "Polish"),
+ ],
)
def test_elementary_detection(
input_data_file: str,
expected_charset: str,
expected_language: str,
):
- best_guess = from_path(DIR_PATH + "/data/{}".format(input_data_file)).best()
+ best_guess = from_path(DIR_PATH + f"/data/{input_data_file}").best()
- assert best_guess is not None, "Elementary detection has failed upon '{}'".format(input_data_file)
- assert best_guess.encoding == expected_charset, "Elementary charset detection has failed upon '{}'".format(input_data_file)
- assert best_guess.language == expected_language, "Elementary language detection has failed upon '{}'".format(input_data_file)
+ assert (
+ best_guess is not None
+ ), f"Elementary detection has failed upon '{input_data_file}'"
+ assert (
+ best_guess.encoding == expected_charset
+ ), f"Elementary charset detection has failed upon '{input_data_file}'"
+ assert (
+ best_guess.language == expected_language
+ ), f"Elementary language detection has failed upon '{input_data_file}'"
diff --git a/tests/test_isbinary.py b/tests/test_isbinary.py
index b134a8a..841474f 100644
--- a/tests/test_isbinary.py
+++ b/tests/test_isbinary.py
@@ -1,28 +1,29 @@
-import pytest
+from __future__ import annotations
+
import typing
-from io import BytesIO
from base64 import b64decode
+from io import BytesIO
+from os import pardir, path
+
+import pytest
+
from charset_normalizer import is_binary
-from os import path, pardir
-DIR_PATH = path.join(
- path.dirname(path.realpath(__file__)),
- pardir
-)
+DIR_PATH = path.join(path.dirname(path.realpath(__file__)), pardir)
@pytest.mark.parametrize(
"raw, expected",
[
- (b'\x00\x5f\x2f\xff'*50, True),
+ (b"\x00\x5f\x2f\xff" * 50, True),
(b64decode("R0lGODlhAQABAAAAACw="), True),
(BytesIO(b64decode("R0lGODlhAQABAAAAACw=")), True),
- ('sample-polish.txt', False),
- ('sample-arabic.txt', False)
- ]
+ ("sample-polish.txt", False),
+ ("sample-arabic.txt", False),
+ ],
)
-def test_isbinary(raw: typing.Union[bytes, typing.BinaryIO, str], expected: bool) -> None:
+def test_isbinary(raw: bytes | typing.BinaryIO | str, expected: bool) -> None:
if isinstance(raw, str):
- raw = DIR_PATH + "/data/{}".format(raw)
+ raw = DIR_PATH + f"/data/{raw}"
assert is_binary(raw) is expected
diff --git a/tests/test_large_payload.py b/tests/test_large_payload.py
index 04526d3..7fc28fa 100644
--- a/tests/test_large_payload.py
+++ b/tests/test_large_payload.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
import pytest
from charset_normalizer import from_bytes
@@ -5,29 +7,43 @@
def test_large_payload_u8_sig_basic_entry():
- payload = ('0' * TOO_BIG_SEQUENCE).encode("utf_8_sig")
+ payload = ("0" * TOO_BIG_SEQUENCE).encode("utf_8_sig")
best_guess = from_bytes(payload).best()
assert best_guess is not None, "Large U8 payload case detection completely failed"
- assert best_guess.encoding == "utf_8", "Large U8 payload case detection wrongly detected!"
+ assert (
+ best_guess.encoding == "utf_8"
+ ), "Large U8 payload case detection wrongly detected!"
assert best_guess.bom is True, "SIG/BOM property should be True"
- assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw"
- assert best_guess._string is not None, "str should be decoded before direct access (sig available)"
+ assert len(best_guess.raw) == len(
+ payload
+ ), "Large payload should remain untouched when accessed through .raw"
+ assert (
+ best_guess._string is not None
+ ), "str should be decoded before direct access (sig available)"
def test_large_payload_ascii_basic_entry():
- payload = ('0' * TOO_BIG_SEQUENCE).encode("utf_8")
+ payload = ("0" * TOO_BIG_SEQUENCE).encode("utf_8")
best_guess = from_bytes(payload).best()
- assert best_guess is not None, "Large ASCII payload case detection completely failed"
- assert best_guess.encoding == "ascii", "Large ASCII payload case detection wrongly detected!"
+ assert (
+ best_guess is not None
+ ), "Large ASCII payload case detection completely failed"
+ assert (
+ best_guess.encoding == "ascii"
+ ), "Large ASCII payload case detection wrongly detected!"
assert best_guess.bom is False, "SIG/BOM property should be False"
- assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw"
+ assert len(best_guess.raw) == len(
+ payload
+ ), "Large payload should remain untouched when accessed through .raw"
assert best_guess._string is None, "str should not be decoded until direct access"
def test_misleading_large_sequence():
- content = (("hello simple ascii " * TOO_BIG_SEQUENCE) + ('我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。')) .encode('utf_8')
+ content = (
+ ("hello simple ascii " * TOO_BIG_SEQUENCE) + ("我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。")
+ ).encode("utf_8")
guesses = from_bytes(content)
@@ -35,5 +51,5 @@ def test_misleading_large_sequence():
match = guesses.best()
assert match is not None
assert match._string is not None, "str should be cached as only match"
- assert match.encoding == 'utf_8'
+ assert match.encoding == "utf_8"
assert str(match) is not None
diff --git a/tests/test_logging.py b/tests/test_logging.py
index f44820e..ad2413e 100644
--- a/tests/test_logging.py
+++ b/tests/test_logging.py
@@ -1,9 +1,12 @@
-import pytest
+from __future__ import annotations
+
import logging
-from charset_normalizer.utils import set_logging_handler
-from charset_normalizer.api import from_bytes, explain_handler
+import pytest
+
+from charset_normalizer.api import explain_handler, from_bytes
from charset_normalizer.constant import TRACE
+from charset_normalizer.utils import set_logging_handler
class TestLogBehaviorClass:
@@ -14,34 +17,32 @@ def setup_method(self):
self.logger.level = logging.WARNING
def test_explain_true_behavior(self, caplog):
- test_sequence = b'This is a test sequence of bytes that should be sufficient'
+ test_sequence = b"This is a test sequence of bytes that should be sufficient"
from_bytes(test_sequence, steps=1, chunk_size=50, explain=True)
assert explain_handler not in self.logger.handlers
for record in caplog.records:
assert record.levelname in ["Level 5", "DEBUG"]
def test_explain_false_handler_set_behavior(self, caplog):
- test_sequence = b'This is a test sequence of bytes that should be sufficient'
+ test_sequence = b"This is a test sequence of bytes that should be sufficient"
set_logging_handler(level=TRACE, format_string="%(message)s")
from_bytes(test_sequence, steps=1, chunk_size=50, explain=False)
- assert any(isinstance(hdl, logging.StreamHandler) for hdl in self.logger.handlers)
+ assert any(
+ isinstance(hdl, logging.StreamHandler) for hdl in self.logger.handlers
+ )
for record in caplog.records:
assert record.levelname in ["Level 5", "DEBUG"]
assert "Encoding detection: ascii is most likely the one." in caplog.text
def test_set_stream_handler(self, caplog):
- set_logging_handler(
- "charset_normalizer", level=logging.DEBUG
- )
+ set_logging_handler("charset_normalizer", level=logging.DEBUG)
self.logger.debug("log content should log with default format")
for record in caplog.records:
assert record.levelname in ["Level 5", "DEBUG"]
assert "log content should log with default format" in caplog.text
def test_set_stream_handler_format(self, caplog):
- set_logging_handler(
- "charset_normalizer", format_string="%(message)s"
- )
+ set_logging_handler("charset_normalizer", format_string="%(message)s")
self.logger.info("log content should only be this message")
assert caplog.record_tuples == [
(
diff --git a/tests/test_mess_detection.py b/tests/test_mess_detection.py
index d70fee4..4089f82 100644
--- a/tests/test_mess_detection.py
+++ b/tests/test_mess_detection.py
@@ -1,27 +1,48 @@
+from __future__ import annotations
+
import pytest
+
from charset_normalizer.md import mess_ratio
@pytest.mark.parametrize(
"content, min_expected_ratio, max_expected_ratio",
[
- ('典肇乎庚辰年十二月廿一,及己丑年二月十九,收各方語言二百五十,合逾七百萬目;二十大卷佔八成,單英文卷亦過二百萬。悉文乃天下有志共筆而成;有意助之,幾網路、隨纂作,大典茁焉。', 0., 0.),
- ('العقلية , التنويم المغناطيسي و / أو الاقتراح', 0., 0.),
- ("RadoZ تـــعــــديــل الـــتــــوقــيــــت مـــن قــبــل", 0., 0.),
+ (
+ "典肇乎庚辰年十二月廿一,及己丑年二月十九,收各方語言二百五十,合逾七百萬目;二十大卷佔八成,單英文卷亦過二百萬。悉文乃天下有志共筆而成;有意助之,幾網路、隨纂作,大典茁焉。",
+ 0.0,
+ 0.0,
+ ),
+ ("العقلية , التنويم المغناطيسي و / أو الاقتراح", 0.0, 0.0),
+ ("RadoZ تـــعــــديــل الـــتــــوقــيــــت مـــن قــبــل", 0.0, 0.0),
("Cehennemin Sava■þ²s²'da kim?", 0.1, 0.5),
- ("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v", 0.5, 1.),
- ("ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli", 0.1, 0.5),
- ("Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.", 0.01, 0.5),
- ("""ØĢØŠØģاØĶŲ ŲŲ ØĢŲ Ø§ŲŲØ§Øģ ŲŲŲ
Ų
ا ØģŲŲبШ쨧ØĶŲŲŲØ ØŊØđŲØ§ ŲØģŲ
Øđ ØđŲ (ŲØąŲØŊŲ) ŲØ§ŲØŪا؊Ų
""", 0.8, 3.0),
+ ("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v", 0.5, 1.0),
+ (
+ "ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli",
+ 0.1,
+ 0.5,
+ ),
+ (
+ "Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.",
+ 0.01,
+ 0.5,
+ ),
+ (
+ """ØĢØŠØģاØĶŲ ŲŲ ØĢŲ Ø§ŲŲØ§Øģ ŲŲŲ
Ų
ا ØģŲŲبШ쨧ØĶŲŲŲØ ØŊØđŲØ§ ŲØģŲ
Øđ ØđŲ (ŲØąŲØŊŲ) ŲØ§ŲØŪا؊Ų
""",
+ 0.8,
+ 3.0,
+ ),
("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.5),
- ("""hishamkoc@yahoo.com ุชุฑุฌู
ููุฉ ููุดูููุงู
ุงููููููููุงูRadoZ ุชูููุนููููุฏูููู ุงููููุชูููููููููููููุช ู
ูููู ูููุจููู""", 0.5, 2.0)
-
- ]
+ (
+ """hishamkoc@yahoo.com ุชุฑุฌู
ููุฉ ููุดูููุงู
ุงููููููููุงูRadoZ ุชูููุนููููุฏูููู ุงููููุชูููููููููููููุช ู
ูููู ูููุจููู""",
+ 0.5,
+ 2.0,
+ ),
+ ],
)
def test_mess_detection(content, min_expected_ratio, max_expected_ratio):
- calculated_mess_ratio = mess_ratio(
- content,
- maximum_threshold=1.
- )
+ calculated_mess_ratio = mess_ratio(content, maximum_threshold=1.0)
- assert min_expected_ratio <= calculated_mess_ratio <= max_expected_ratio, "The mess detection ratio calculated for given content is not well adjusted!"
+ assert (
+ min_expected_ratio <= calculated_mess_ratio <= max_expected_ratio
+ ), "The mess detection ratio calculated for given content is not well adjusted!"
diff --git a/tests/test_preemptive_detection.py b/tests/test_preemptive_detection.py
index 042415b..e56c4a1 100644
--- a/tests/test_preemptive_detection.py
+++ b/tests/test_preemptive_detection.py
@@ -1,5 +1,8 @@
+from __future__ import annotations
+
import pytest
+from charset_normalizer import CharsetMatch
from charset_normalizer.utils import any_specified_encoding
@@ -9,18 +12,81 @@
(b'', "euc_jp"),
(b'', "utf_8"),
(b'', None),
- (b'# coding: utf-8', "utf_8"),
- (b'', 'utf_8'),
- (b'', 'ascii'),
- (b'', 'johab'),
- (b'', 'cp037'),
- (b'', "cp1252"),
+ (b"# coding: utf-8", "utf_8"),
+ (b'', "utf_8"),
+ (b'', "ascii"),
+ (b'', "johab"),
+ (b'', "cp037"),
+ (b"", "cp1252"),
(b'', "cp1256"),
- ]
+ ],
)
def test_detect_most_common_body_encoding(payload, expected_encoding):
- specified_encoding = any_specified_encoding(
- payload
+ specified_encoding = any_specified_encoding(payload)
+
+ assert (
+ specified_encoding == expected_encoding
+ ), "Unable to determine properly encoding from given body"
+
+
+@pytest.mark.parametrize(
+ "payload, expected_outcome",
+ [
+ (
+ b'',
+ b'',
+ ),
+ (
+ b'',
+ b'',
+ ),
+ (
+ b'',
+ b'',
+ ),
+ (b"# coding: utf-8", b"# coding: utf-8"),
+ (
+ b'',
+ b'',
+ ),
+ (
+ b'',
+ b'',
+ ),
+ (
+ b'',
+ b'',
+ ),
+ (
+ b"",
+ b"",
+ ),
+ (
+ b'',
+ b'',
+ ),
+ ],
+)
+def test_preemptive_mark_replacement(payload, expected_outcome):
+ """
+ When generating (to Unicode converted) bytes, we want to change any potential declarative charset
+ to utf-8. This test that.
+ """
+ specified_encoding = any_specified_encoding(payload)
+
+ detected_encoding = (
+ specified_encoding if specified_encoding is not None else "utf-8"
)
- assert specified_encoding == expected_encoding, "Unable to determine properly encoding from given body"
+ m = CharsetMatch(
+ payload,
+ detected_encoding,
+ 0.0,
+ False,
+ [],
+ preemptive_declaration=specified_encoding,
+ )
+
+ transformed_output = m.output()
+
+ assert transformed_output == expected_outcome
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 5c603b3..a0cc088 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,6 +1,10 @@
+from __future__ import annotations
+
import logging
+
import pytest
-from charset_normalizer.utils import is_accentuated, cp_similarity, set_logging_handler
+
+from charset_normalizer.utils import cp_similarity, is_accentuated, set_logging_handler
@pytest.mark.parametrize(