Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions MIGRATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,60 @@ builder = PromptBuilder(
builder.run(name="John") # greeting renders as ""
```

### Pipeline deserialization is gated by a module allowlist

**What changed:** `Pipeline.load`, `Pipeline.loads`, and `Pipeline.from_dict` now refuse to import classes from modules outside a trusted-module allowlist and raise a `DeserializationError` instead. The default allowlist contains `haystack`, `haystack_integrations`, `haystack_experimental`, `builtins`, `typing`, and `collections`. Pipelines that reference custom components, callables, or types in other packages will fail to load until those modules are explicitly allowed.

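In addition, `default_from_dict` now rejects a nested `{"type": "..."}` dictionary when its `type` is a dotted, fully qualified class name and its key is not an `__init__` parameter of the parent class. Classes whose `__init__` accepts `**kwargs` (or whose signature cannot be inspected) are exempt, because their parameter names cannot be validated. This check can surface pre-existing YAML bugs (typos, leftovers from removed parameters, stale snapshots).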

**Why:** Loading a pipeline from YAML used to dynamically import any class referenced in the file, which made a crafted YAML capable of causing arbitrary classes to be imported and instantiated. Gating imports through an allowlist closes that gap while leaving Haystack's own packages working out of the box.

**How to migrate:**

If your pipeline only references components from `haystack`, `haystack_integrations`, or `haystack_experimental`, no action is needed.

Otherwise, extend the allowlist (or explicitly bypass it) via one of the four mechanisms below.

Before (v2.x), all modules implicitly trusted:
```python
from haystack import Pipeline

# Worked for any class on the import path, including third-party packages.
with open("pipeline.yaml") as fp:
pipeline = Pipeline.load(fp)
```

After (v3.0), pick one of the following options. The first two scope the trust to a single call; the others extend it process-wide.

```python
# 1. Per-call kwarg — recommended for application code that knows exactly which extra
# packages a given YAML needs.
from haystack import Pipeline

with open("pipeline.yaml") as fp:
pipeline_a = Pipeline.load(fp, allowed_modules=["mypkg.*", "anotherpkg.components.*"])

# 2. Per-call bypass — equivalent to "I fully trust this YAML; skip the allowlist".
# Mirrors the `yaml.safe_load` / `yaml.unsafe_load` convention.
with open("pipeline.yaml") as fp:
pipeline_b = Pipeline.load(fp, unsafe=True)

# 3. Process-wide programmatic — call once at startup, e.g. in your application's
# entry point or a custom integration package's __init__.
from haystack.core.serialization import allow_deserialization_module

allow_deserialization_module("mypkg.*")
with open("pipeline.yaml") as fp:
pipeline_c = Pipeline.load(fp) # `mypkg.*` is now trusted for every load in this process.
```

```bash
# 4. Environment variable — useful for ops/deployments where code shouldn't change.
# Comma-separated patterns; read at runtime on every deserialization call.
export HAYSTACK_DESERIALIZATION_ALLOWLIST="mypkg.*,otherpkg.*"
```

Patterns are matched as prefixes by default (`"mypkg"` matches `mypkg` and any submodule), or as `fnmatch` globs if they contain `*`, `?`, or `[` somewhere other than a trailing `.*`.

### Generators removed

**What changed:** `OpenAIGenerator`, `AzureOpenAIGenerator`, `HuggingFaceAPIGenerator`, and `HuggingFaceLocalGenerator` have been removed.
Expand Down
67 changes: 58 additions & 9 deletions haystack/core/pipeline/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
component_to_dict,
generate_qualified_class_name,
)
from haystack.core.serialization_security import _check_module_allowed, _deserialization_context
from haystack.core.type_utils import (
ConversionStrategyType,
_convert_value,
Expand Down Expand Up @@ -173,7 +174,13 @@ def to_dict(self) -> dict[str, Any]:

@classmethod
def from_dict(
cls: type[T], data: dict[str, Any], callbacks: DeserializationCallbacks | None = None, **kwargs: Any
cls: type[T],
data: dict[str, Any],
callbacks: DeserializationCallbacks | None = None,
*,
allowed_modules: list[str] | None = None,
unsafe: bool = False,
**kwargs: Any,
) -> T:
"""
Deserializes the pipeline from a dictionary.
Expand All @@ -182,12 +189,28 @@ def from_dict(
Dictionary to deserialize from.
:param callbacks:
Callbacks to invoke during deserialization.
:param allowed_modules:
Additional module patterns whose classes may be imported during deserialization.
By default, only modules under `haystack`, `haystack_integrations`, `haystack_experimental`,
`builtins`, `typing`, and `collections` are trusted. See
`haystack.core.serialization.allow_deserialization_module` for the matching semantics.
:param unsafe:
If `True`, bypass the deserialization allowlist entirely. Only use this when you fully
trust the source of the serialized data — any class in any importable module can be
instantiated.
:param kwargs:
`components`: a dictionary of `{name: instance}` to reuse instances of components instead of creating new
ones.
:returns:
Deserialized pipeline.
"""
with _deserialization_context(allowed_modules=allowed_modules, unsafe=unsafe):
return cls._from_dict_impl(data, callbacks, **kwargs)

@classmethod
def _from_dict_impl(
cls: type[T], data: dict[str, Any], callbacks: DeserializationCallbacks | None = None, **kwargs: Any
) -> T:
data_copy = _deepcopy_with_exceptions(data) # to prevent modification of original data
metadata = data_copy.get("metadata", {})
max_runs_per_component = data_copy.get("max_runs_per_component", 100)
Expand All @@ -206,28 +229,32 @@ def from_dict(
if "type" not in component_data:
raise PipelineError(f"Missing 'type' in component '{name}'")

if component_data["type"] not in component.registry:
component_type = component_data["type"]
if isinstance(component_type, str) and "." in component_type:
_check_module_allowed(component_type.rsplit(".", 1)[0])

if component_type not in component.registry:
try:
# Import the module first...
module, _ = component_data["type"].rsplit(".", 1)
module, _ = component_type.rsplit(".", 1)
logger.debug("Trying to import module {module_name}", module_name=module)
type_serialization.thread_safe_import(module)
# ...then try again
if component_data["type"] not in component.registry:
if component_type not in component.registry:
raise PipelineError( # noqa: TRY301
f"Successfully imported module '{module}' but couldn't find "
f"'{component_data['type']}' in the component registry.\n"
f"'{component_type}' in the component registry.\n"
f"The component might be registered under a different path. "
f"Here are the registered components:\n {list(component.registry.keys())}\n"
)
except (ImportError, PipelineError, ValueError) as e:
raise PipelineError(
f"Component '{component_data['type']}' (name: '{name}') not imported. Please "
f"Component '{component_type}' (name: '{name}') not imported. Please "
f"check that the package is installed and the component path is correct."
) from e

# Create a new one
component_class = component.registry[component_data["type"]]
component_class = component.registry[component_type]

try:
instance = component_from_dict(component_class, component_data, name, callbacks)
Expand Down Expand Up @@ -287,6 +314,9 @@ def loads(
data: str | bytes | bytearray,
marshaller: Marshaller = DEFAULT_MARSHALLER,
callbacks: DeserializationCallbacks | None = None,
*,
allowed_modules: list[str] | None = None,
unsafe: bool = False,
) -> T:
"""
Creates a `Pipeline` object from the string representation passed in the `data` argument.
Expand All @@ -297,6 +327,14 @@ def loads(
The Marshaller used to create the string representation. Defaults to `YamlMarshaller`.
:param callbacks:
Callbacks to invoke during deserialization.
:param allowed_modules:
Additional module patterns whose classes may be imported during deserialization.
By default, only modules under `haystack`, `haystack_integrations`, `haystack_experimental`,
`builtins`, `typing`, and `collections` are trusted.
:param unsafe:
If `True`, bypass the deserialization allowlist entirely. Only use this when you fully
trust the source of the serialized data — any class in any importable module can be
instantiated.
:raises DeserializationError:
If an error occurs during deserialization.
:returns:
Expand All @@ -310,14 +348,17 @@ def loads(
"caused by malformed or invalid syntax in the serialized representation."
) from e

return cls.from_dict(deserialized_data, callbacks)
return cls.from_dict(deserialized_data, callbacks, allowed_modules=allowed_modules, unsafe=unsafe)

@classmethod
def load(
cls: type[T],
fp: TextIO,
marshaller: Marshaller = DEFAULT_MARSHALLER,
callbacks: DeserializationCallbacks | None = None,
*,
allowed_modules: list[str] | None = None,
unsafe: bool = False,
) -> T:
"""
Creates a `Pipeline` object from a string representation.
Expand All @@ -331,12 +372,20 @@ def load(
The Marshaller used to create the string representation. Defaults to `YamlMarshaller`.
:param callbacks:
Callbacks to invoke during deserialization.
:param allowed_modules:
Additional module patterns whose classes may be imported during deserialization.
By default, only modules under `haystack`, `haystack_integrations`, `haystack_experimental`,
`builtins`, `typing`, and `collections` are trusted.
:param unsafe:
If `True`, bypass the deserialization allowlist entirely. Only use this when you fully
trust the source of the serialized data — any class in any importable module can be
instantiated.
:raises DeserializationError:
If an error occurs during deserialization.
:returns:
A `Pipeline` object.
"""
return cls.loads(fp.read(), marshaller, callbacks)
return cls.loads(fp.read(), marshaller, callbacks, allowed_modules=allowed_modules, unsafe=unsafe)

def add_component(self, name: str, instance: Component) -> None:
"""
Expand Down
52 changes: 51 additions & 1 deletion haystack/core/serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,22 @@
from haystack import logging
from haystack.core.component.component import _hook_component_init
from haystack.core.errors import DeserializationError, SerializationError
from haystack.core.serialization_security import _check_module_allowed, allow_deserialization_module
from haystack.utils.auth import Secret
from haystack.utils.device import ComponentDevice
from haystack.utils.type_serialization import thread_safe_import

__all__ = [
"DeserializationCallbacks",
"allow_deserialization_module",
"component_from_dict",
"component_to_dict",
"default_from_dict",
"default_to_dict",
"generate_qualified_class_name",
"import_class_by_name",
]

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -287,6 +299,8 @@ def default_from_dict(cls: type[T], data: dict[str, Any]) -> T:
if data["type"] != generate_qualified_class_name(cls):
raise DeserializationError(f"Class '{data['type']}' can't be deserialized as '{cls.__name__}'")

valid_init_param_names = _init_parameter_names(cls)

# Automatically detect and deserialize objects with from_dict methods
for key, value in init_params.items():
if isinstance(value, dict) and "type" in value:
Expand All @@ -299,6 +313,12 @@ def default_from_dict(cls: type[T], data: dict[str, Any]) -> T:
init_params[key] = ComponentDevice.from_dict(value)
# If type looks like a fully qualified class name, try to import it and deserialize
elif isinstance(type_value, str) and "." in type_value:
# Reject before importing if the parent class does not accept this parameter.
# This blocks YAML that smuggles untrusted classes into unused parameter slots.
if valid_init_param_names is not None and key not in valid_init_param_names:
raise DeserializationError(
f"Refusing to deserialize unknown parameter '{key}' for '{cls.__name__}'."
)
try:
imported_class = import_class_by_name(type_value)
if hasattr(imported_class, "from_dict") and callable(imported_class.from_dict):
Expand All @@ -311,6 +331,30 @@ def default_from_dict(cls: type[T], data: dict[str, Any]) -> T:
return cls(**init_params)


def _init_parameter_names(cls: type[object]) -> set[str] | None:
"""
Return the set of init parameter names accepted by `cls`.

Returns `None` if the constructor accepts arbitrary keyword arguments (`**kwargs`) — in
which case we cannot validate keys.
"""
try:
signature = inspect.signature(cls.__init__)
except (TypeError, ValueError):
return None
names: set[str] = set()
for name, param in signature.parameters.items():
if name == "self":
continue
if param.kind is inspect.Parameter.VAR_KEYWORD:
# Constructor accepts **kwargs; we cannot tell whether `key` is a real parameter.
return None
if param.kind is inspect.Parameter.VAR_POSITIONAL:
continue
names.add(name)
return names


def import_class_by_name(fully_qualified_name: str) -> type[object]:
"""
Utility function to import (load) a class object based on its fully qualified class name.
Expand All @@ -319,12 +363,18 @@ def import_class_by_name(fully_qualified_name: str) -> type[object]:
It splits the name into module path and class name, imports the module,
and returns the class object.

For security, the module path is checked against the deserialization allowlist
(see :mod:`haystack.core.serialization_security`). Modules outside the allowlist
are rejected with a :class:`DeserializationError`.

:param fully_qualified_name: the fully qualified class name as a string
:returns: the class object.
:raises ImportError: If the class cannot be imported or found.
:raises DeserializationError: If the module is not on the deserialization allowlist.
"""
module_path, class_name = fully_qualified_name.rsplit(".", 1)
_check_module_allowed(module_path)
try:
module_path, class_name = fully_qualified_name.rsplit(".", 1)
logger.debug(
"Attempting to import class '{cls_name}' from module '{md_path}'", cls_name=class_name, md_path=module_path
)
Expand Down
Loading
Loading