Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dbdemos/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.6.28"
__version__ = "0.6.34"

from .dbdemos import list_demos, install, create_cluster, help, install_all, check_status_all, check_status, get_html_list_demos

1 change: 1 addition & 0 deletions dbdemos/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ def __init__(self, path: str, json_conf: dict, catalog:str = None, schema: str =
self.dashboards = json_conf.get('dashboards', [])
self.sql_queries = json_conf.get('sql_queries', [])
self.bundle = json_conf.get('bundle', False)
self.env_version = json_conf.get('env_version', 2)

self.data_folders: List[DataFolder] = []
for data_folder in json_conf.get('data_folders', []):
Expand Down
77 changes: 45 additions & 32 deletions dbdemos/installer_genie.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,40 +217,53 @@ def load_data_to_volume(self, ws: WorkspaceClient, data_folder: DataFolder, demo

import requests
import collections
dbutils = self.installer.get_dbutils()
try:
# Get the list of files from the GitHub API, to avoid adding an S3 boto dependency just for this
github_path = f"https://api.github.com/repos/databricks-demos/dbdemos-dataset/contents/{data_folder.source_folder}"
if debug:
print(f"Getting files from {github_path}")
files = requests.get(github_path).json()
if 'message' in files:
print(f"Error getting files from {github_path}: {files}")
files = [f['download_url'] for f in files]

if debug:
print(f"Found {len(files)} files in GitHub repo for {data_folder.source_folder}")

def copy_file(file_url):
if not file_url.endswith('/'):
file_name = file_url.split('/')[-1]
folder = data_folder.target_volume_folder_name if data_folder.target_volume_folder_name else data_folder.source_folder
target_path = f"/Volumes/{demo_conf.catalog}/{demo_conf.schema}/{InstallerGenie.VOLUME_NAME}/{folder}/{file_name}"

s3_url = file_url.replace("https://raw.githubusercontent.com/databricks-demos/dbdemos-dataset/main/",
"https://dbdemos-dataset.s3.amazonaws.com/")

if debug:
print(f"Copying {s3_url} to {target_path}")
response = requests.get(s3_url)
response.raise_for_status()
if debug:
print(f"File {file_name} in memory. sending to volume...")
ws.files.upload(target_path, response.content, overwrite=True)
folder = data_folder.target_volume_folder_name if data_folder.target_volume_folder_name else data_folder.source_folder
#first try with a dbutils copy if available
copied_successfully = False
if dbutils is not None:
try:
dbutils.fs.cp(f"s3://dbdemos-dataset/{data_folder.source_folder}", f"/Volumes/{demo_conf.catalog}/{demo_conf.schema}/{InstallerGenie.VOLUME_NAME}/{folder}", recurse=True)
copied_successfully = True
except Exception as e:
if debug:
print(f"File {file_name} in volume!")

with ThreadPoolExecutor(max_workers=5) as executor:
collections.deque(executor.map(copy_file, files))
print(f"Error copying {data_folder.source_folder} to {f'/Volumes/{demo_conf.catalog}/{demo_conf.schema}/{InstallerGenie.VOLUME_NAME}/{folder}'} using dbutils fs.cp: {e}")
if debug:
print(f"Copied {data_folder.source_folder} to {f'/Volumes/{demo_conf.catalog}/{demo_conf.schema}/{InstallerGenie.VOLUME_NAME}/{folder}'} using dbutils fs.cp")
if not copied_successfully:
# Get the list of files from the GitHub API, to avoid adding an S3 boto dependency just for this
github_path = f"https://api.github.com/repos/databricks-demos/dbdemos-dataset/contents/{data_folder.source_folder}"
if debug:
print(f"Getting files from {github_path}")
files = requests.get(github_path).json()
if 'message' in files:
print(f"Error getting files from {github_path}: {files}")
files = [f['download_url'] for f in files]

if debug:
print(f"Found {len(files)} files in GitHub repo for {data_folder.source_folder}")

def copy_file(file_url):
if not file_url.endswith('/'):
file_name = file_url.split('/')[-1]
target_path = f"/Volumes/{demo_conf.catalog}/{demo_conf.schema}/{InstallerGenie.VOLUME_NAME}/{folder}/{file_name}"

s3_url = file_url.replace("https://raw.githubusercontent.com/databricks-demos/dbdemos-dataset/main/",
"https://dbdemos-dataset.s3.amazonaws.com/")

if debug:
print(f"Copying {s3_url} to {target_path}")
response = requests.get(s3_url)
response.raise_for_status()
if debug:
print(f"File {file_name} in memory. sending to volume...")
ws.files.upload(target_path, response.content, overwrite=True)
if debug:
print(f"File {file_name} in volume!")

with ThreadPoolExecutor(max_workers=5) as executor:
collections.deque(executor.map(copy_file, files))

except Exception as e:
raise DataLoaderException(f"Error loading data from S3: {str(e)}")
Expand Down
4 changes: 2 additions & 2 deletions dbdemos/notebook_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,15 +324,15 @@ def add_javascript_to_minisite_relative_links(self, notebook_path):

# Set the environment metadata on the notebook.
# TODO: might want to re-evaluate this once we move to ipynb format as it'll be set in the ipynb file, as metadata.
def set_environement_metadata(self, client_version: str = "2"):
def set_environement_metadata(self, client_version: str = "3"):
content = json.loads(self.content)
env_metadata = content.get("environmentMetadata", {})
if env_metadata is None:
env_metadata = {}
if ("client" not in env_metadata or
env_metadata["client"] is None or
int(env_metadata["client"]) < int(client_version)):
env_metadata["client"] = client_version
env_metadata["client"] = str(client_version)
content["environmentMetadata"] = env_metadata
self.content = json.dumps(content)

Expand Down
8 changes: 4 additions & 4 deletions dbdemos/packager.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,11 @@ def process_file_content(self, file, destination_path, extension = ""):
with open(destination_path + extension, "wb") as f:
f.write(file_content)

def process_notebook_content(self, html, full_path):
def process_notebook_content(self, demo_conf: DemoConf, html, full_path):
#Replace notebook content.
parser = NotebookParser(html)
parser.remove_uncomment_tag()
parser.set_environement_metadata()
parser.set_environement_metadata(demo_conf.env_version)
parser.remove_dbdemos_build()
#parser.remove_static_settings()
parser.hide_commands_and_results()
Expand Down Expand Up @@ -106,7 +106,7 @@ def download_notebook_html(notebook: DemoNotebook):
if 'error_code' in file:
raise Exception(f"Couldn't find file {repo_path} in workspace. Check notebook path in bundle conf file. {file['error_code']} - {file['message']}")
html = base64.b64decode(file['content']).decode('utf-8')
return self.process_notebook_content(html, full_path+".html")
return self.process_notebook_content(demo_conf, html, full_path+".html")
elif status['object_type'] == 'DIRECTORY':
folder = self.db.get("2.0/workspace/export", {"path": repo_path, "format": "AUTO", "direct_download": True})
return self.process_file_content(folder, full_path, ".zip")
Expand All @@ -124,7 +124,7 @@ def download_notebook_html(notebook: DemoNotebook):
if "views" not in notebook_result:
raise Exception(f"couldn't get notebook for run {tasks[0]['run_id']} - {notebook.path}. {demo_conf.name}. You probably did a run repair. Please re run the job. - {notebook_result}")
html = notebook_result["views"][0]["content"]
return self.process_notebook_content(html, full_path+".html")
return self.process_notebook_content(demo_conf, html, full_path+".html")


requires_global_setup_v2 = False
Expand Down
12 changes: 6 additions & 6 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,12 @@ def bundle():


# Run the jobs (only if there is a new commit since the last time, or failure, or force execution)
bundler.start_and_wait_bundle_jobs(force_execution = False, skip_execution=True, recreate_jobs=False)
bundler.start_and_wait_bundle_jobs(force_execution = False, skip_execution=False, recreate_jobs=False)

packager = Packager(conf, bundler)
packager.package_all()

#bundle()
bundle()

#Loads conf to install on cse2.
with open("local_conf_E2FE.json", "r") as r:
Expand Down Expand Up @@ -97,13 +97,13 @@ def bundle():
#dbdemos.install("lakehouse-fsi-credit", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], catalog='main', schema='quentin_test2', cloud="AWS", start_cluster = False, skip_dashboards=False)
#dbdemos.install("lakehouse-fsi-fraud", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], catalog='main', schema='quentin_test2', cloud="AWS", start_cluster = False, skip_dashboards=False)

dbdemos.install("lakehouse-iot-platform", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], catalog='main', schema='quentin_test3', cloud="AWS", start_cluster = False)
dbdemos.install("pipeline-bike", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], catalog='main', schema='quentin_test3', cloud="AWS", start_cluster = False)
#dbdemos.install("lakehouse-iot-platform", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], catalog='main', schema='quentin_test3', cloud="AWS", start_cluster = False)
#dbdemos.install("pipeline-bike", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], catalog='main', schema='quentin_test3', cloud="AWS", start_cluster = False)

#dbdemos.install("feature-store", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], cloud="AWS", use_current_cluster=False, current_cluster_id=c["current_cluster_id"])
dbdemos.install("feature-store", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], cloud="AWS", use_current_cluster=False, current_cluster_id=c["current_cluster_id"])
#dbdemos.install("delta-lake", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], cloud="GCP")
#dbdemos.install("delta-lake", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], cloud="Azure", use_current_cluster=True, current_cluster_id=c["current_cluster_id"])
#dbdemos.install("mlops-end2end", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], cloud="AWS", skip_dashboards=True, schema='test_quentin_rag', catalog='dbdemos')
dbdemos.install("mlops-end2end", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], cloud="AWS", skip_dashboards=True, schema='test_quentin_rag', catalog='dbdemos')
#dbdemos.install("pandas-on-spark", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'], cloud="AWS")
#dbdemos.install("delta-sharing-airlines", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'])
#dbdemos.install("dlt-loans", "/Users/quentin.ambard@databricks.com/test_install_quentin", True, c['username'], c['pat_token'], c['url'])
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#this will be the package name you will see, e.g. the output of 'conda list' in anaconda prompt
name = 'dbdemos',
#some version number you may wish to add - increment this after every update
version='0.6.28',
version='0.6.34',
author="Databricks",
author_email=["quentin.ambard@databricks.com", "cal.reynolds@databricks.com"],
description="Install databricks demos: notebooks, Delta Live Table Pipeline, DBSQL Dashboards, ML Models etc.",
Expand Down