From 1bde248d7d9c037a927646a40ecf8d93704bce2c Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Wed, 5 Nov 2025 11:54:11 -0500 Subject: [PATCH 1/2] update to 0.6.33 - fix feature store, mlops, ai agent and support v3 env --- dbdemos/conf.py | 1 + dbdemos/installer_genie.py | 77 ++++++++++++++++++++++---------------- dbdemos/notebook_parser.py | 4 +- dbdemos/packager.py | 8 ++-- main.py | 12 +++--- 5 files changed, 58 insertions(+), 44 deletions(-) diff --git a/dbdemos/conf.py b/dbdemos/conf.py index 7e28747..3c27798 100644 --- a/dbdemos/conf.py +++ b/dbdemos/conf.py @@ -242,6 +242,7 @@ def __init__(self, path: str, json_conf: dict, catalog:str = None, schema: str = self.dashboards = json_conf.get('dashboards', []) self.sql_queries = json_conf.get('sql_queries', []) self.bundle = json_conf.get('bundle', False) + self.env_version = json_conf.get('env_version', 2) self.data_folders: List[DataFolder] = [] for data_folder in json_conf.get('data_folders', []): diff --git a/dbdemos/installer_genie.py b/dbdemos/installer_genie.py index 21c81b1..d7a6cd6 100644 --- a/dbdemos/installer_genie.py +++ b/dbdemos/installer_genie.py @@ -217,40 +217,53 @@ def load_data_to_volume(self, ws: WorkspaceClient, data_folder: DataFolder, demo import requests import collections + dbutils = self.installer.get_dbutils() try: - # Get list of files from GitHub API, to avoid adding a S3 boto dependency just for this - github_path = f"https://api.github.com/repos/databricks-demos/dbdemos-dataset/contents/{data_folder.source_folder}" - if debug: - print(f"Getting files from {github_path}") - files = requests.get(github_path).json() - if 'message' in files: - print(f"Error getting files from {github_path}: {files}") - files = [f['download_url'] for f in files] - - if debug: - print(f"Found {len(files)} files in GitHub repo for {data_folder.source_folder}") - - def copy_file(file_url): - if not file_url.endswith('/'): - file_name = file_url.split('/')[-1] - folder = data_folder.target_volume_folder_name if data_folder.target_volume_folder_name else data_folder.source_folder - target_path = f"/Volumes/{demo_conf.catalog}/{demo_conf.schema}/{InstallerGenie.VOLUME_NAME}/{folder}/{file_name}" - - s3_url = file_url.replace("https://raw.githubusercontent.com/databricks-demos/dbdemos-dataset/main/", - "https://dbdemos-dataset.s3.amazonaws.com/") - - if debug: - print(f"Copying {s3_url} to {target_path}") - response = requests.get(s3_url) - response.raise_for_status() - if debug: - print(f"File {file_name} in memory. sending to volume...") - ws.files.upload(target_path, response.content, overwrite=True) + folder = data_folder.target_volume_folder_name if data_folder.target_volume_folder_name else data_folder.source_folder + #first try with a dbutils copy if available + copied_successfully = False + if dbutils is not None: + try: + dbutils.fs.cp(f"s3://dbdemos-dataset/{data_folder.source_folder}", f"/Volumes/{demo_conf.catalog}/{demo_conf.schema}/{InstallerGenie.VOLUME_NAME}/{folder}", recurse=True) + copied_successfully = True + except Exception as e: if debug: - print(f"File {file_name} in volume!") - - with ThreadPoolExecutor(max_workers=5) as executor: - collections.deque(executor.map(copy_file, files)) + print(f"Error copying {data_folder.source_folder} to {f'/Volumes/{demo_conf.catalog}/{demo_conf.schema}/{InstallerGenie.VOLUME_NAME}/{folder}'} using dbutils fs.cp: {e}") + if debug: + print(f"Copied {data_folder.source_folder} to {f'/Volumes/{demo_conf.catalog}/{demo_conf.schema}/{InstallerGenie.VOLUME_NAME}/{folder}'} using dbutils fs.cp") + if not copied_successfully: + # Get list of files from GitHub API, to avoid adding a S3 boto dependency just for this + github_path = f"https://api.github.com/repos/databricks-demos/dbdemos-dataset/contents/{data_folder.source_folder}" + if debug: + print(f"Getting files from {github_path}") + files = requests.get(github_path).json() + if 'message' in files: + print(f"Error getting files from {github_path}: {files}") + files = [f['download_url'] for f in files] + + if debug: + print(f"Found {len(files)} files in GitHub repo for {data_folder.source_folder}") + + def copy_file(file_url): + if not file_url.endswith('/'): + file_name = file_url.split('/')[-1] + target_path = f"/Volumes/{demo_conf.catalog}/{demo_conf.schema}/{InstallerGenie.VOLUME_NAME}/{folder}/{file_name}" + + s3_url = file_url.replace("https://raw.githubusercontent.com/databricks-demos/dbdemos-dataset/main/", + "https://dbdemos-dataset.s3.amazonaws.com/") + + if debug: + print(f"Copying {s3_url} to {target_path}") + response = requests.get(s3_url) + response.raise_for_status() + if debug: + print(f"File {file_name} in memory. sending to volume...") + ws.files.upload(target_path, response.content, overwrite=True) + if debug: + print(f"File {file_name} in volume!") + + with ThreadPoolExecutor(max_workers=5) as executor: + collections.deque(executor.map(copy_file, files)) except Exception as e: raise DataLoaderException(f"Error loading data from S3: {str(e)}") diff --git a/dbdemos/notebook_parser.py b/dbdemos/notebook_parser.py index a70d2ea..3b2079a 100644 --- a/dbdemos/notebook_parser.py +++ b/dbdemos/notebook_parser.py @@ -324,7 +324,7 @@ def add_javascript_to_minisite_relative_links(self, notebook_path): #Set the environment metadata to the notebook. # TODO: might want to re-evaluate this once we move to ipynb format as it'll be set in the ipynb file, as metadata. - def set_environement_metadata(self, client_version: str = "2"): + def set_environement_metadata(self, client_version: str = "3"): content = json.loads(self.content) env_metadata = content.get("environmentMetadata", {}) if env_metadata is None: @@ -332,7 +332,7 @@ def set_environement_metadata(self, client_version: str = "2"): if ("client" not in env_metadata or env_metadata["client"] is None or int(env_metadata["client"]) < int(client_version)): - env_metadata["client"] = client_version + env_metadata["client"] = str(client_version) content["environmentMetadata"] = env_metadata self.content = json.dumps(content) diff --git a/dbdemos/packager.py b/dbdemos/packager.py index d71193c..e9ffde5 100644 --- a/dbdemos/packager.py +++ b/dbdemos/packager.py @@ -58,11 +58,11 @@ def process_file_content(self, file, destination_path, extension = ""): with open(destination_path + extension, "wb") as f: f.write(file_content) - def process_notebook_content(self, html, full_path): + def process_notebook_content(self, demo_conf: DemoConf, html, full_path): #Replace notebook content. parser = NotebookParser(html) parser.remove_uncomment_tag() - parser.set_environement_metadata() + parser.set_environement_metadata(demo_conf.env_version) parser.remove_dbdemos_build() #parser.remove_static_settings() parser.hide_commands_and_results() @@ -106,7 +106,7 @@ def download_notebook_html(notebook: DemoNotebook): if 'error_code' in file: raise Exception(f"Couldn't find file {repo_path} in workspace. Check notebook path in bundle conf file. {file['error_code']} - {file['message']}") html = base64.b64decode(file['content']).decode('utf-8') - return self.process_notebook_content(html, full_path+".html") + return self.process_notebook_content(demo_conf, html, full_path+".html") elif status['object_type'] == 'DIRECTORY': folder = self.db.get("2.0/workspace/export", {"path": repo_path, "format": "AUTO", "direct_download": True}) return self.process_file_content(folder, full_path, ".zip") @@ -124,7 +124,7 @@ def download_notebook_html(notebook: DemoNotebook): if "views" not in notebook_result: raise Exception(f"couldn't get notebook for run {tasks[0]['run_id']} - {notebook.path}. {demo_conf.name}. You probably did a run repair. Please re run the job. - {notebook_result}") html = notebook_result["views"][0]["content"] - return self.process_notebook_content(html, full_path+".html") + return self.process_notebook_content(demo_conf, html, full_path+".html") requires_global_setup_v2 = False diff --git a/main.py b/main.py index 786f3d1..2ced8dc 100644 --- a/main.py +++ b/main.py @@ -35,12 +35,12 @@ def bundle(): # Run the jobs (only if there is a new commit since the last time, or failure, or force execution) - bundler.start_and_wait_bundle_jobs(force_execution = False, skip_execution=True, recreate_jobs=False) + bundler.start_and_wait_bundle_jobs(force_execution = False, skip_execution=False, recreate_jobs=False) packager = Packager(conf, bundler) packager.package_all() -#bundle() +bundle() #Loads conf to install on cse2. with open("local_conf_E2FE.json", "r") as r: @@ -97,13 +97,13 @@ def bundle(): #dbdemos.install("lakehouse-fsi-credit", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], catalog='main', schema='quentin_test2', cloud="AWS", start_cluster = False, skip_dashboards=False) #dbdemos.install("lakehouse-fsi-fraud", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], catalog='main', schema='quentin_test2', cloud="AWS", start_cluster = False, skip_dashboards=False) -dbdemos.install("lakehouse-iot-platform", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], catalog='main', schema='quentin_test3', cloud="AWS", start_cluster = False) -dbdemos.install("pipeline-bike", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], catalog='main', schema='quentin_test3', cloud="AWS", start_cluster = False) +#dbdemos.install("lakehouse-iot-platform", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], catalog='main', schema='quentin_test3', cloud="AWS", start_cluster = False) +#dbdemos.install("pipeline-bike", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], catalog='main', schema='quentin_test3', cloud="AWS", start_cluster = False) -#dbdemos.install("feature-store", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], cloud="AWS", use_current_cluster=False, current_cluster_id=c["current_cluster_id"]) +dbdemos.install("feature-store", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], cloud="AWS", use_current_cluster=False, current_cluster_id=c["current_cluster_id"]) #dbdemos.install("delta-lake", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], cloud="GCP") #dbdemos.install("delta-lake", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], cloud="Azure", use_current_cluster=True, current_cluster_id=c["current_cluster_id"]) -#dbdemos.install("mlops-end2end", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], cloud="AWS", skip_dashboards=True, schema='test_quentin_rag', catalog='dbdemos') +dbdemos.install("mlops-end2end", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], cloud="AWS", skip_dashboards=True, schema='test_quentin_rag', catalog='dbdemos') #dbdemos.install("pandas-on-spark", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], cloud="AWS") #dbdemos.install("delta-sharing-airlines", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url']) #dbdemos.install("dlt-loans", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url']) From d5675c73b992345f19f156e92368279ad3899e69 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Thu, 13 Nov 2025 08:52:21 -0500 Subject: [PATCH 2/2] Bump version to 0.6.34 --- dbdemos/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dbdemos/__init__.py b/dbdemos/__init__.py index ddf0e39..c422bdb 100644 --- a/dbdemos/__init__.py +++ b/dbdemos/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.28" +__version__ = "0.6.34" from .dbdemos import list_demos, install, create_cluster, help, install_all, check_status_all, check_status, get_html_list_demos diff --git a/setup.py b/setup.py index 30eb1f4..a51d2bb 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ #this will be the package name you will see, e.g. the output of 'conda list' in anaconda prompt name = 'dbdemos', #some version number you may wish to add - increment this after every update - version='0.6.28', + version='0.6.34', author="Databricks", author_email=["quentin.ambard@databricks.com", "cal.reynolds@databricks.com"], description="Install databricks demos: notebooks, Delta Live Table Pipeline, DBSQL Dashboards, ML Models etc.",