I am trying to use joblib-spark to run scikit-learn cross-validation on Databricks:
from sklearn.utils import parallel_backend
from sklearn.model_selection import cross_val_score
from sklearn import datasets
from sklearn import svm
from joblibspark import register_spark
register_spark()  # register spark backend

iris = datasets.load_iris()
clf = svm.SVC(kernel='linear', C=1)

# Run the 5-fold cross-validation through joblib's 'spark' backend.
# The body of the `with` statement must be indented; otherwise Python
# rejects the snippet with an IndentationError before anything runs.
with parallel_backend('spark', n_jobs=3):
    scores = cross_val_score(clf, iris.data, iris.target, cv=5)

print(scores)
The issue is that calling register_spark() raises:
ImportError: To use the spark.distributed backend you must install the pyspark and packages.
File /databricks/python/lib/python3.12/site-packages/joblibspark/__init__.py:29, in register_spark()
28 try:
---> 29 from .backend import register # pylint: disable=C0415
30 register()
File /databricks/python_shell/lib/dbruntime/autoreload/discoverability/hook.py:81, in AutoreloadDiscoverabilityHook.pre_run_cell.<locals>.patched_import(name, *args, **kwargs)
80 self._should_hint = True
---> 81 module = self._original_builtins_import(name, *args, **kwargs)
82 if (fname := fname or get_allowed_file_name_or_none(module)) is not None:
File /databricks/python/lib/python3.12/site-packages/joblibspark/backend.py:28
26 from joblib.parallel \
27 import AutoBatchingMixin, ParallelBackendBase, register_parallel_backend, SequentialBackend
---> 28 from joblib._parallel_backends import SafeFunction
30 from py4j.clientserver import ClientServer
ImportError: cannot import name 'SafeFunction' from 'joblib._parallel_backends' (/databricks/python/lib/python3.12/site-packages/joblib/_parallel_backends.py)
During handling of the above exception, another exception occurred:
ImportError Traceback (most recent call last)
File <command-2725362822092390>, line 7
4 from sklearn import svm
5 from joblibspark import register_spark
----> 7 register_spark() # register spark backend
9 iris = datasets.load_iris()
10 clf = svm.SVC(kernel='linear', C=1)
File /databricks/python/lib/python3.12/site-packages/joblibspark/__init__.py:34, in register_spark()
31 except ImportError:
32 msg = ("To use the spark.distributed backend you must install "
33 "the pyspark and packages.\n\n")
---> 34 raise ImportError(msg)
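Clearly, a missing Spark installation is not the issue, because Databricks provides a Spark session by default. The chained traceback shows that the underlying failure is joblibspark/backend.py being unable to import `SafeFunction` from `joblib._parallel_backends`; the "you must install the pyspark and packages" message is only the generic text that register_spark() raises for any ImportError. This suggests that the installed joblib and joblib-spark versions are incompatible with each other.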
I am trying to use joblib-spark to run scikit-learn cross-validation on Databricks:
The issue is that calling register_spark() raises:
Clearly, a missing Spark installation is not the issue, because Databricks provides a Spark session by default.