Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,17 @@
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)

### Added
- Argument `--no-preemptive` in the CLI to prevent the detector from searching for hints.
- Support for Python 3.13 (#512)

### Fixed
- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything other than a CharsetMatch.
- Improved the general reliability of the detector based on user feedback. (#520) (#509) (#498) (#407) (#537)
- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)

## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)

### Fixed
Expand Down
9 changes: 6 additions & 3 deletions build-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# while we migrate to pyproject.toml,
# this represents the minimum requirements to build (for the optional speedup)
mypy==1.6.1; python_version >= '3.8'
--find-links https://github.com/mypyc/mypy_mypyc-wheels/releases/expanded_assets/v1.12.0+dev.b2deaaecf1a11e13bc962558992b5f2d5701f295
mypy==1.11.2; python_version >= '3.8' and python_version < '3.13'
mypy==1.12.0; python_version >= '3.13'
mypy==1.4.1; python_version < '3.8'
build==0.10.0
wheel==0.41.2
build>=0.10.0,<2
wheel==0.42.0
setuptools>=68,<76
78 changes: 60 additions & 18 deletions charset_normalizer/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ def from_bytes(

results: CharsetMatches = CharsetMatches()

early_stop_results: CharsetMatches = CharsetMatches()

sig_encoding, sig_payload = identify_sig_or_bom(sequences)

if sig_encoding is not None:
Expand Down Expand Up @@ -221,16 +223,20 @@ def from_bytes(
try:
if is_too_large_sequence and is_multi_byte_decoder is False:
str(
sequences[: int(50e4)]
if strip_sig_or_bom is False
else sequences[len(sig_payload) : int(50e4)],
(
sequences[: int(50e4)]
if strip_sig_or_bom is False
else sequences[len(sig_payload) : int(50e4)]
),
encoding=encoding_iana,
)
else:
decoded_payload = str(
sequences
if strip_sig_or_bom is False
else sequences[len(sig_payload) :],
(
sequences
if strip_sig_or_bom is False
else sequences[len(sig_payload) :]
),
encoding=encoding_iana,
)
except (UnicodeDecodeError, LookupError) as e:
Expand Down Expand Up @@ -367,7 +373,13 @@ def from_bytes(
and not lazy_str_hard_failure
):
fallback_entry = CharsetMatch(
sequences, encoding_iana, threshold, False, [], decoded_payload
sequences,
encoding_iana,
threshold,
False,
[],
decoded_payload,
preemptive_declaration=specified_encoding,
)
if encoding_iana == specified_encoding:
fallback_specified = fallback_entry
Expand Down Expand Up @@ -421,28 +433,58 @@ def from_bytes(
),
)

results.append(
CharsetMatch(
sequences,
encoding_iana,
mean_mess_ratio,
bom_or_sig_available,
cd_ratios_merged,
decoded_payload,
)
current_match = CharsetMatch(
sequences,
encoding_iana,
mean_mess_ratio,
bom_or_sig_available,
cd_ratios_merged,
(
decoded_payload
if (
is_too_large_sequence is False
or encoding_iana in [specified_encoding, "ascii", "utf_8"]
)
else None
),
preemptive_declaration=specified_encoding,
)

results.append(current_match)

if (
encoding_iana in [specified_encoding, "ascii", "utf_8"]
and mean_mess_ratio < 0.1
):
# If md says nothing to worry about, then... stop immediately!
if mean_mess_ratio == 0.0:
logger.debug(
"Encoding detection: %s is most likely the one.",
current_match.encoding,
)
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([current_match])

early_stop_results.append(current_match)

if (
len(early_stop_results)
and (specified_encoding is None or specified_encoding in tested)
and "ascii" in tested
and "utf_8" in tested
):
probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
logger.debug(
"Encoding detection: %s is most likely the one.", encoding_iana
"Encoding detection: %s is most likely the one.",
probable_result.encoding,
)
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([results[encoding_iana]])

return CharsetMatches([probable_result])

if encoding_iana == sig_encoding:
logger.debug(
Expand Down
36 changes: 30 additions & 6 deletions charset_normalizer/cli/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,14 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
dest="force",
help="Replace file without asking if you are sure, use this flag with caution.",
)
parser.add_argument(
"-i",
"--no-preemptive",
action="store_true",
default=False,
dest="no_preemptive",
help="Disable looking at a charset declaration to hint the detector.",
)
parser.add_argument(
"-t",
"--threshold",
Expand All @@ -133,31 +141,47 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
args = parser.parse_args(argv)

if args.replace is True and args.normalize is False:
if args.files:
for my_file in args.files:
my_file.close()
print("Use --replace in addition of --normalize only.", file=sys.stderr)
return 1

if args.force is True and args.replace is False:
if args.files:
for my_file in args.files:
my_file.close()
print("Use --force in addition of --replace only.", file=sys.stderr)
return 1

if args.threshold < 0.0 or args.threshold > 1.0:
if args.files:
for my_file in args.files:
my_file.close()
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
return 1

x_ = []

for my_file in args.files:
matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
matches = from_fp(
my_file,
threshold=args.threshold,
explain=args.verbose,
preemptive_behaviour=args.no_preemptive is False,
)

best_guess = matches.best()

if best_guess is None:
print(
'Unable to identify originating encoding for "{}". {}'.format(
my_file.name,
"Maybe try increasing maximum amount of chaos."
if args.threshold < 1.0
else "",
(
"Maybe try increasing maximum amount of chaos."
if args.threshold < 1.0
else ""
),
),
file=sys.stderr,
)
Expand Down Expand Up @@ -258,8 +282,8 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
try:
x_[0].unicode_path = join(dir_path, ".".join(o_))

with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
fp.write(str(best_guess))
with open(x_[0].unicode_path, "wb") as fp:
fp.write(best_guess.output())
except IOError as e:
print(str(e), file=sys.stderr)
if my_file.closed is False:
Expand Down
2 changes: 2 additions & 0 deletions charset_normalizer/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,8 @@
"|",
'"',
"-",
"(",
")",
}


Expand Down
15 changes: 13 additions & 2 deletions charset_normalizer/legacy.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,24 @@
from typing import Any, Dict, Optional, Union
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Optional
from warnings import warn

from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE

# TODO: remove this check when dropping Python 3.7 support
if TYPE_CHECKING:
from typing_extensions import TypedDict

class ResultDict(TypedDict):
encoding: Optional[str]
language: str
confidence: Optional[float]


def detect(
byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> Dict[str, Optional[Union[str, float]]]:
) -> ResultDict:
"""
chardet legacy method
Detect the encoding of the given byte string. It should be mostly backward-compatible.
Expand Down
19 changes: 16 additions & 3 deletions charset_normalizer/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def reset(self) -> None: # pragma: no cover

@property
def ratio(self) -> float:
if self._character_count <= 24:
if self._character_count <= 13:
return 0.0

ratio_of_suspicious_range_usage: float = (
Expand All @@ -260,6 +260,7 @@ def __init__(self) -> None:

self._buffer: str = ""
self._buffer_accent_count: int = 0
self._buffer_glyph_count: int = 0

def eligible(self, character: str) -> bool:
return True
Expand All @@ -279,6 +280,14 @@ def feed(self, character: str) -> None:
and is_thai(character) is False
):
self._foreign_long_watch = True
if (
is_cjk(character)
or is_hangul(character)
or is_katakana(character)
or is_hiragana(character)
or is_thai(character)
):
self._buffer_glyph_count += 1
return
if not self._buffer:
return
Expand All @@ -291,17 +300,20 @@ def feed(self, character: str) -> None:
self._character_count += buffer_length

if buffer_length >= 4:
if self._buffer_accent_count / buffer_length > 0.34:
if self._buffer_accent_count / buffer_length >= 0.5:
self._is_current_word_bad = True
# Words/buffers ending with an upper case accentuated letter are so rare
# that we consider them all suspicious. Same weight as a foreign_long suspicious word.
if (
elif (
is_accentuated(self._buffer[-1])
and self._buffer[-1].isupper()
and all(_.isupper() for _ in self._buffer) is False
):
self._foreign_long_count += 1
self._is_current_word_bad = True
elif self._buffer_glyph_count == 1:
self._is_current_word_bad = True
self._foreign_long_count += 1
if buffer_length >= 24 and self._foreign_long_watch:
camel_case_dst = [
i
Expand All @@ -325,6 +337,7 @@ def feed(self, character: str) -> None:
self._foreign_long_watch = False
self._buffer = ""
self._buffer_accent_count = 0
self._buffer_glyph_count = 0
elif (
character not in {"<", ">", "-", "=", "~", "|", "_"}
and character.isdigit() is False
Expand Down
Loading
Loading