From 68baa3e4d184051cbbbb774a6b6bd1a677e27f4a Mon Sep 17 00:00:00 2001 From: Noah Luna <15202580+ngrayluna@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:53:39 -0700 Subject: [PATCH 1/2] wip --- colabs/wandb_registry/zoo_wandb.ipynb | 405 ++++++++++++++++++++------ 1 file changed, 323 insertions(+), 82 deletions(-) diff --git a/colabs/wandb_registry/zoo_wandb.ipynb b/colabs/wandb_registry/zoo_wandb.ipynb index 4f61a044..3c0b1342 100644 --- a/colabs/wandb_registry/zoo_wandb.ipynb +++ b/colabs/wandb_registry/zoo_wandb.ipynb @@ -50,17 +50,62 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "b87ee0c6-4e88-4c77-9cf5-dde109bb7ebd", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: wandb in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (0.25.1)\n", + "Requirement already satisfied: torch in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (2.9.1)\n", + "Requirement already satisfied: ucimlrepo in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (0.0.7)\n", + "Requirement already satisfied: scikit-learn in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (1.7.2)\n", + "Requirement already satisfied: pandas in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (2.3.2)\n", + "Requirement already satisfied: click>=8.0.1 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (8.1.8)\n", + "Requirement already satisfied: gitpython!=3.1.29,>=1.0.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (3.1.43)\n", + "Requirement already satisfied: packaging in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (24.2)\n", + "Requirement already satisfied: platformdirs in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (4.3.6)\n", + "Requirement already satisfied: protobuf!=5.28.0,!=5.29.0,<7,>4.21.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (5.29.3)\n", + "Requirement already satisfied: pydantic<3 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (2.10.6)\n", + "Requirement already satisfied: pyyaml in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (6.0.2)\n", + "Requirement already satisfied: requests<3,>=2.0.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (2.32.3)\n", + "Requirement already satisfied: sentry-sdk>=2.0.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (2.18.0)\n", + "Requirement already satisfied: typing-extensions<5,>=4.8 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (4.12.2)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from pydantic<3->wandb) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.27.2 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from pydantic<3->wandb) (2.27.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from requests<3,>=2.0.0->wandb) (3.4.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from requests<3,>=2.0.0->wandb) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from requests<3,>=2.0.0->wandb) (2.3.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from requests<3,>=2.0.0->wandb) (2024.8.30)\n", + "Requirement already satisfied: filelock in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from torch) (3.18.0)\n", + "Requirement already satisfied: sympy>=1.13.3 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from torch) (1.14.0)\n", + "Requirement already satisfied: networkx>=2.5.1 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from torch) (3.4.2)\n", + "Requirement already satisfied: jinja2 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from torch) (3.1.4)\n", + "Requirement already satisfied: fsspec>=0.8.5 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from torch) (2024.10.0)\n", + "Requirement already satisfied: numpy>=1.22.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from scikit-learn) (2.2.3)\n", + "Requirement already satisfied: scipy>=1.8.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from scikit-learn) (1.14.1)\n", + "Requirement already satisfied: joblib>=1.2.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from scikit-learn) (1.4.2)\n", + "Requirement already satisfied: threadpoolctl>=3.1.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from scikit-learn) (3.5.0)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from pandas) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from pandas) (2024.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from pandas) (2024.2)\n", + "Requirement already satisfied: gitdb<5,>=4.0.1 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from gitpython!=3.1.29,>=1.0.0->wandb) (4.0.11)\n", + "Requirement already satisfied: smmap<6,>=3.0.1 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb) (5.0.1)\n", + "Requirement already satisfied: six>=1.5 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", + "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from sympy>=1.13.3->torch) (1.3.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from jinja2->torch) (3.0.2)\n" + ] + } + ], "source": [ "!pip install wandb torch ucimlrepo scikit-learn pandas" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "f41c6701-7375-4bb0-a70f-3fdc5bff1693", "metadata": {}, "outputs": [], @@ -88,7 +133,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "dda1fc39-c06f-468c-82ad-736ca764e105", "metadata": {}, "outputs": [], @@ -113,10 +158,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "8edb34ad-02df-4256-b5de-6058c2826305", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "features: (101, 16) type: \n", + "labels: (101, 1) type: \n" + ] + } + ], "source": [ "print(\"features: \", X.shape, \"type: \", type(X))\n", "print(\"labels: \", y.shape, \"type: \", type(y))" @@ -124,10 +178,170 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "dde2f7c4-7db7-4d09-9850-e7c6121ab775", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hairfeatherseggsmilkairborneaquaticpredatortoothedbackbonebreathesvenomousfinslegstaildomesticcatsize
01001001111004001
11001000111004101
20010011110010100
31001001111004001
41001001111004101
\n", + "
" + ], + "text/plain": [ + " hair feathers eggs milk airborne aquatic predator toothed backbone \\\n", + "0 1 0 0 1 0 0 1 1 1 \n", + "1 1 0 0 1 0 0 0 1 1 \n", + "2 0 0 1 0 0 1 1 1 1 \n", + "3 1 0 0 1 0 0 1 1 1 \n", + "4 1 0 0 1 0 0 1 1 1 \n", + "\n", + " breathes venomous fins legs tail domestic catsize \n", + "0 1 0 0 4 0 0 1 \n", + "1 1 0 0 4 1 0 1 \n", + "2 0 0 1 0 1 0 0 \n", + "3 1 0 0 4 0 0 1 \n", + "4 1 0 0 4 1 0 1 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "X.head(5)" ] @@ -144,10 +358,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "3ff90527-d818-4678-bf01-3efe4c8c58a7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dataset: torch.Size([101, 16]) dtype: torch.float32\n", + "labels: torch.Size([101, 1]) dtype: torch.int64\n" + ] + } + ], "source": [ "# Data type of the data must match the data type of the model, the default dtype for nn.Linear is torch.float32\n", "dataset = torch.tensor(X.values).type(torch.float32) \n", @@ -169,7 +392,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "46dacfb7-7493-4d36-8167-212f8b51af7c", "metadata": {}, "outputs": [], @@ -183,9 +406,21 @@ "id": "44c2b1fb-a1af-4311-aae9-86497d00ffda", "metadata": {}, "source": [ + "## Create a registry\n", + "Let's save our dataset to the W&B Registry as an artifact so that future teammates can reproduce our experiments, analyze the data, etc. at a future date. To do this, let's first create a *registry*:\n", + "\n", + "1. Navigate to your organization's Registry App\n", + "2. Click on the **Create registry** button\n", + "3. Provide a name for your registry in the **Name** field. For this notebook, we use \"Zoo\".\n", + "4. Select \"Organization\" from the **Registry visibility** dropdown.\n", + "5. Select \"All types\" from the **Accepted artifact types** dropdown. \n", + "6. Click **Create registry**.\n", + "\n", + "Now that you have a registry created, let's add our dataset to our registry.\n", + "\n", "## Track and publish dataset \n", "\n", - "Within the Dataset registry we will create a collection called \"zoo-dataset-tensors\". A *collection* is a set of linked artifact versions in a registry. \n", + "Within the \"Zoo\" registry we will create a collection called \"zoo-dataset-tensors\". A *collection* is a set of linked artifact versions in a registry. \n", "\n", "To create a collection we need to do two things:\n", "1. Specify the collection and registry we want to link our artifact version to. To do this, we specify a \"target path\" for our artifact version.\n", @@ -210,7 +445,7 @@ "source": [ "### Publish dataset to registry\n", "\n", - "Let's publish our dataset to the Dataset registry in a collection called \"zoo-dataset-tensors\". To do this, we will \n", + "Let's publish our dataset to the \"Zoo\" registry in a collection called \"zoo-dataset-tensors\". To do this, we will \n", "\n", "1. Get or create the target path. (For this notebook we will programmatically create the target path).\n", "1. Initialize a run\n", @@ -228,7 +463,7 @@ "metadata": {}, "outputs": [], "source": [ - "REGISTRY_NAME = \"Dataset\"\n", + "REGISTRY_NAME = \"Zoo\"\n", "COLLECTION_NAME = \"zoo-dataset-tensors\"\n", "\n", "# Path to link the artifact to a collection\n", @@ -240,7 +475,7 @@ "id": "66d03210-1de4-44a4-9773-67b4f5d71dfc", "metadata": {}, "source": [ - "Now that we have the target path, let's publish the dataset to the \"Dataset\" registry. In the following code cell, ensure to replace the values enclosed in `<>` with your team's entity:" + "Now that we have the target path, let's publish the dataset to the \"Zoo\" Registry. In the following code cell, ensure to replace the values enclosed in `<>` with your team's entity:" ] }, { @@ -325,7 +560,7 @@ "id": "028d0992-15a9-4687-8c3b-4218bf70adb9", "metadata": {}, "source": [ - "Next, let's publish this dataset into a different collection within the Dataset registry called \"zoo-dataset-tensors-split\":" + "Next, let's publish this dataset into a different collection within the registry called \"zoo-dataset-tensors-split\":" ] }, { @@ -357,7 +592,6 @@ "artifact.add_file(local_path=\"zoo_labels_y_test.pt\", name=\"zoo_labels_y_test\")\n", "\n", "# Create a target path for our artifact in the registry\n", - "REGISTRY_NAME = \"Dataset\"\n", "COLLECTION_NAME = \"zoo-dataset-tensors-split\"\n", "target_dataset_path=f\"wandb-registry-{REGISTRY_NAME}/{COLLECTION_NAME}\"\n", "\n", @@ -374,7 +608,7 @@ "We can verify we correctly linked our artifact to our desired collection and registry with W&B App UI: \n", "\n", "1. Navigate to the Registry App\n", - "2. Select on the Dataset registry\n", + "2. Select the \"Zoo\" Registry\n", "3. Click **View details** \"zoo-dataset-tensors-split\" collection\n", "4. Click the **View** button next to the artifact version\n", "5. Select the **Files** tab\n", @@ -458,7 +692,7 @@ "\n", "Next, let's train a model using the training data we published to the registry earlier in this notebook. After we train the model, we will publish that model to W&B.\n", "\n", - "To do this, let's first get the artifact we published to the \"Dataset\" registry. To retrieve an artifact from a registry, we need to know the name of that artifact. The name of an artifact in a registry consists of the prefix `wandb-registry-`, the name of the registry, the name of the collection, and the artifact version:\n", + "To do this, let's first get the artifact we published to the \"Zoo\" registry. To retrieve an artifact from a registry, we need to know the name of that artifact. The name of an artifact in a registry consists of the prefix `wandb-registry-`, the name of the registry, the name of the collection, and the artifact version:\n", "\n", "```python\n", "# Artifact name/filepath for downloading and using artifacts published to a registry\n", @@ -482,66 +716,64 @@ "metadata": {}, "outputs": [], "source": [ - "run = wandb.init(entity = TEAM_ENTITY, project = PROJECT, job_type = \"training\", config = hyperparameter_config)\n", + "with wandb.init(entity = TEAM_ENTITY, project = PROJECT, job_type = \"training\", config = hyperparameter_config) as run:\n", "\n", - "# Get dataset artifacts from registry\n", - "VERSION = 0\n", - "artifact_name = f\"wandb-registry-{REGISTRY_NAME.lower()}/{COLLECTION_NAME}:v{VERSION}\"\n", - "dataset_artifact = run.use_artifact(artifact_or_name=artifact_name)\n", - "\n", - "# Download only the training data\n", - "X_train_path = dataset_artifact.download(path_prefix=\"zoo_dataset_X_train\")\n", - "y_train_path = dataset_artifact.download(path_prefix=\"zoo_labels_y_train\")\n", - "\n", - "# Load data as tensors \n", - "X_train = torch.load(f=X_train_path+\"/zoo_dataset_X_train\")\n", - "y_train = torch.load(f=y_train_path+\"/zoo_labels_y_train\")\n", - "\n", - "# Set initial dummy loss value to compare to in training loop\n", - "prev_best_loss = 1e10 \n", - "\n", - "# Training loop\n", - "for e in range(hyperparameter_config[\"epochs\"] + 1):\n", - " pred = model(X_train)\n", - " loss = loss_fn(pred, y_train.squeeze(1))\n", - " \n", - " loss.backward()\n", - " optimizer.step()\n", - " optimizer.zero_grad()\n", - "\n", - " wandb.log({\n", - " \"train/epoch_ndx\": e,\n", - " \"train/train_loss\": loss\n", - " })\n", - "\n", - " # Checkpoint/save model if loss improves\n", - " if (e % 100 == 0) and (loss <= prev_best_loss):\n", - " print(\"epoch: \", e, \"loss:\", loss.item())\n", - " \n", - " PATH = 'zoo_wandb.pth' \n", - " torch.save(model.state_dict(), PATH)\n", - "\n", - " model_artifact_name = f\"zoo-{wandb.run.id}\"\n", - " artifact = wandb.Artifact(\n", - " name=model_artifact_name,\n", - " type=\"model\",\n", - " metadata={\n", - " \"num_classes\": 7,\n", - " \"model_type\": wandb.config[\"model_type\"]\n", - " }\n", - " )\n", + " # Get dataset artifacts from registry\n", + " VERSION = 0\n", + " artifact_name = f\"wandb-registry-{REGISTRY_NAME.lower()}/{COLLECTION_NAME}:v{VERSION}\"\n", + " dataset_artifact = run.use_artifact(artifact_or_name=artifact_name)\n", "\n", + " # Download only the training data\n", + " X_train_path = dataset_artifact.download(path_prefix=\"zoo_dataset_X_train\")\n", + " y_train_path = dataset_artifact.download(path_prefix=\"zoo_labels_y_train\")\n", "\n", - " # Store new best loss\n", - " prev_best_loss = loss\n", + " # Load data as tensors \n", + " X_train = torch.load(f=X_train_path+\"/zoo_dataset_X_train\")\n", + " y_train = torch.load(f=y_train_path+\"/zoo_labels_y_train\")\n", "\n", - "print(f'Saving model artifact {model_artifact_name}')\n", + " # Set initial dummy loss value to compare to in training loop\n", + " prev_best_loss = 1e10 \n", "\n", - "# Add saved model to artifact\n", - "artifact.add_file(PATH)\n", - "artifact.save()\n", + " # Training loop\n", + " for e in range(hyperparameter_config[\"epochs\"] + 1):\n", + " pred = model(X_train)\n", + " loss = loss_fn(pred, y_train.squeeze(1))\n", + " \n", + " loss.backward()\n", + " optimizer.step()\n", + " optimizer.zero_grad()\n", "\n", - "run.finish()" + " run.log({\n", + " \"train/epoch_ndx\": e,\n", + " \"train/train_loss\": loss\n", + " })\n", + "\n", + " # Checkpoint/save model if loss improves\n", + " if (e % 100 == 0) and (loss <= prev_best_loss):\n", + " print(\"epoch: \", e, \"loss:\", loss.item())\n", + " \n", + " PATH = 'zoo_wandb.pth' \n", + " torch.save(model.state_dict(), PATH)\n", + "\n", + " model_artifact_name = f\"zoo-{wandb.run.id}\"\n", + " artifact = wandb.Artifact(\n", + " name=model_artifact_name,\n", + " type=\"model\",\n", + " metadata={\n", + " \"num_classes\": 7,\n", + " \"model_type\": wandb.config[\"model_type\"]\n", + " }\n", + " )\n", + "\n", + "\n", + " # Store new best loss\n", + " prev_best_loss = loss\n", + "\n", + " print(f'Saving model artifact {model_artifact_name}')\n", + "\n", + " # Add saved model to artifact\n", + " artifact.add_file(PATH)\n", + " artifact.save()" ] }, { @@ -551,7 +783,7 @@ "source": [ "The preceding cell might look intimidating. Let's break it down:\n", "\n", - "* First, we download the dataset from the Dataset registry and load it as a tensor\n", + "* First, we download the dataset from the registry and load it as a tensor\n", "* Next, we create a simple training loop\n", " * Within the training loop we log the loss for each step\n", " * We checkpoint(save) the model every time the remainder of the epoch divided by 100 is 0 and the loss is lower than the previously recorded loss.\n", @@ -568,7 +800,7 @@ "metadata": {}, "source": [ "## Publish model to registry\n", - "Let's make this model artifact available to other users in our organization. To do this, we will create a collection within the Model registry.\n", + "Let's make this model artifact available to other users in our organization. To do this, we will create a collection within our registry.\n", "\n", "To create a collection within a registry, we need to know the full name of the artifact. The full name of the artifact consists of the name we provided to it when we created the Artifact object and its location within our team's project.\n", "\n", @@ -606,9 +838,9 @@ "id": "6aaa9221-f6ac-454b-9a48-47597f47a572", "metadata": {}, "source": [ - "Now that we have the full name of our model artifact. Let's publish it to the model registry.\n", + "Now that we have the full name of our model artifact. Let's publish it to the registry.\n", "\n", - "Similar to how we created a target path when we published our dataset artifact to the Dataset registry, let's create the target path for our model artifact. The target path tells W&B the collection and registry (Model registry) to link our artifact version to. \n", + "Similar to how we created a target path when we published our dataset artifact to the registry, let's create the target path for our model artifact. The target path tells W&B the collection and registry (Zoo registry) to link our artifact version to. \n", "\n", "As a reminder, the target path to link an artifact to a registry consists of:\n", "\n", @@ -625,7 +857,6 @@ "metadata": {}, "outputs": [], "source": [ - "REGISTRY_NAME = \"Model\"\n", "COLLECTION_NAME = \"Zoo_Classifier_Models\"\n", "\n", "target_path = f\"wandb-registry-{REGISTRY_NAME}/{COLLECTION_NAME}\"\n", @@ -731,7 +962,6 @@ "outputs": [], "source": [ "# Create model artifact name\n", - "REGISTRY_NAME = \"model\"\n", "COLLECTION_NAME = \"Zoo_Classifier_Models\"\n", "VERSION = 0\n", "\n", @@ -802,9 +1032,9 @@ "id": "93895213-e75e-46e6-b3a8-8017ccaff9e2", "metadata": {}, "source": [ - "### Get test dataset from Dataset registry\n", + "### Get test dataset from the registry\n", "\n", - "Let's get the test dataset from our registry. Similar to the above code block, we will specify the full name of the artifact version we want from our Dataset registry." + "Let's get the test dataset from our registry. Similar to the above code block, we will specify the full name of the artifact version we want from our registry." ] }, { @@ -815,7 +1045,6 @@ "outputs": [], "source": [ "# Create dataset artifact name\n", - "REGISTRY_NAME = \"dataset\"\n", "COLLECTION_NAME = \"zoo-dataset-tensors-split\"\n", "VERSION = 0\n", "\n", @@ -1033,6 +1262,18 @@ "kernelspec": { "display_name": "Python 3", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" } }, "nbformat": 4, From 0068c65eead5e5e543af46416bb58a7e87a91630 Mon Sep 17 00:00:00 2001 From: Noah Luna <15202580+ngrayluna@users.noreply.github.com> Date: Mon, 20 Apr 2026 13:24:56 -0700 Subject: [PATCH 2/2] remove metadata --- colabs/wandb_registry/zoo_wandb.ipynb | 372 ++++---------------------- 1 file changed, 56 insertions(+), 316 deletions(-) diff --git a/colabs/wandb_registry/zoo_wandb.ipynb b/colabs/wandb_registry/zoo_wandb.ipynb index 3c0b1342..08f74174 100644 --- a/colabs/wandb_registry/zoo_wandb.ipynb +++ b/colabs/wandb_registry/zoo_wandb.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "acc18434", "metadata": {}, "source": [ "\"Open\n", @@ -50,62 +51,17 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "b87ee0c6-4e88-4c77-9cf5-dde109bb7ebd", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: wandb in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (0.25.1)\n", - "Requirement already satisfied: torch in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (2.9.1)\n", - "Requirement already satisfied: ucimlrepo in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (0.0.7)\n", - "Requirement already satisfied: scikit-learn in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (1.7.2)\n", - "Requirement already satisfied: pandas in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (2.3.2)\n", - "Requirement already satisfied: click>=8.0.1 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (8.1.8)\n", - "Requirement already satisfied: gitpython!=3.1.29,>=1.0.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (3.1.43)\n", - "Requirement already satisfied: packaging in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (24.2)\n", - "Requirement already satisfied: platformdirs in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (4.3.6)\n", - "Requirement already satisfied: protobuf!=5.28.0,!=5.29.0,<7,>4.21.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (5.29.3)\n", - "Requirement already satisfied: pydantic<3 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (2.10.6)\n", - "Requirement already satisfied: pyyaml in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (6.0.2)\n", - "Requirement already satisfied: requests<3,>=2.0.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (2.32.3)\n", - "Requirement already satisfied: sentry-sdk>=2.0.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (2.18.0)\n", - "Requirement already satisfied: typing-extensions<5,>=4.8 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from wandb) (4.12.2)\n", - "Requirement already satisfied: annotated-types>=0.6.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from pydantic<3->wandb) (0.7.0)\n", - "Requirement already satisfied: pydantic-core==2.27.2 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from pydantic<3->wandb) (2.27.2)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from requests<3,>=2.0.0->wandb) (3.4.0)\n", - "Requirement already satisfied: idna<4,>=2.5 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from requests<3,>=2.0.0->wandb) (3.10)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from requests<3,>=2.0.0->wandb) (2.3.0)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from requests<3,>=2.0.0->wandb) (2024.8.30)\n", - "Requirement already satisfied: filelock in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from torch) (3.18.0)\n", - "Requirement already satisfied: sympy>=1.13.3 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from torch) (1.14.0)\n", - "Requirement already satisfied: networkx>=2.5.1 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from torch) (3.4.2)\n", - "Requirement already satisfied: jinja2 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from torch) (3.1.4)\n", - "Requirement already satisfied: fsspec>=0.8.5 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from torch) (2024.10.0)\n", - "Requirement already satisfied: numpy>=1.22.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from scikit-learn) (2.2.3)\n", - "Requirement already satisfied: scipy>=1.8.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from scikit-learn) (1.14.1)\n", - "Requirement already satisfied: joblib>=1.2.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from scikit-learn) (1.4.2)\n", - "Requirement already satisfied: threadpoolctl>=3.1.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from scikit-learn) (3.5.0)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from pandas) (2.9.0.post0)\n", - "Requirement already satisfied: pytz>=2020.1 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from pandas) (2024.2)\n", - "Requirement already satisfied: tzdata>=2022.7 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from pandas) (2024.2)\n", - "Requirement already satisfied: gitdb<5,>=4.0.1 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from gitpython!=3.1.29,>=1.0.0->wandb) (4.0.11)\n", - "Requirement already satisfied: smmap<6,>=3.0.1 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb) (5.0.1)\n", - "Requirement already satisfied: six>=1.5 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", - "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from sympy>=1.13.3->torch) (1.3.0)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /Users/noahluna/.pyenv/versions/3.10.0/envs/wandb_basic/lib/python3.10/site-packages (from jinja2->torch) (3.0.2)\n" - ] - } - ], + "outputs": [], "source": [ "!pip install wandb torch ucimlrepo scikit-learn pandas" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "f41c6701-7375-4bb0-a70f-3fdc5bff1693", "metadata": {}, "outputs": [], @@ -133,7 +89,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "dda1fc39-c06f-468c-82ad-736ca764e105", "metadata": {}, "outputs": [], @@ -158,19 +114,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "8edb34ad-02df-4256-b5de-6058c2826305", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "features: (101, 16) type: \n", - "labels: (101, 1) type: \n" - ] - } - ], + "outputs": [], "source": [ "print(\"features: \", X.shape, \"type: \", type(X))\n", "print(\"labels: \", y.shape, \"type: \", type(y))" @@ -178,170 +125,10 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "dde2f7c4-7db7-4d09-9850-e7c6121ab775", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
hairfeatherseggsmilkairborneaquaticpredatortoothedbackbonebreathesvenomousfinslegstaildomesticcatsize
01001001111004001
11001000111004101
20010011110010100
31001001111004001
41001001111004101
\n", - "
" - ], - "text/plain": [ - " hair feathers eggs milk airborne aquatic predator toothed backbone \\\n", - "0 1 0 0 1 0 0 1 1 1 \n", - "1 1 0 0 1 0 0 0 1 1 \n", - "2 0 0 1 0 0 1 1 1 1 \n", - "3 1 0 0 1 0 0 1 1 1 \n", - "4 1 0 0 1 0 0 1 1 1 \n", - "\n", - " breathes venomous fins legs tail domestic catsize \n", - "0 1 0 0 4 0 0 1 \n", - "1 1 0 0 4 1 0 1 \n", - "2 0 0 1 0 1 0 0 \n", - "3 1 0 0 4 0 0 1 \n", - "4 1 0 0 4 1 0 1 " - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "X.head(5)" ] @@ -358,19 +145,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "3ff90527-d818-4678-bf01-3efe4c8c58a7", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "dataset: torch.Size([101, 16]) dtype: torch.float32\n", - "labels: torch.Size([101, 1]) dtype: torch.int64\n" - ] - } - ], + "outputs": [], "source": [ "# Data type of the data must match the data type of the model, the default dtype for nn.Linear is torch.float32\n", "dataset = torch.tensor(X.values).type(torch.float32) \n", @@ -392,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "46dacfb7-7493-4d36-8167-212f8b51af7c", "metadata": {}, "outputs": [], @@ -488,24 +266,22 @@ "TEAM_ENTITY = \"\"\n", "PROJECT = \"zoo_experiment\"\n", "\n", - "run = wandb.init(\n", + "with wandb.init(\n", " entity=TEAM_ENTITY,\n", " project=PROJECT,\n", " job_type=\"publish_dataset\"\n", - ")\n", - "\n", - "artifact = wandb.Artifact(\n", - " name=\"zoo_dataset\",\n", - " type=\"dataset\", \n", - " description=\"Processed dataset and labels.\"\n", - ")\n", + ") as run:\n", "\n", - "artifact.add_file(local_path=\"zoo_dataset.pt\", name=\"zoo_dataset\")\n", - "artifact.add_file(local_path=\"zoo_labels.pt\", name=\"zoo_labels\")\n", + " artifact = wandb.Artifact(\n", + " name=\"zoo_dataset\",\n", + " type=\"dataset\", \n", + " description=\"Processed dataset and labels.\"\n", + " )\n", "\n", - "run.link_artifact(artifact=artifact, target_path=dataset_target_path)\n", + " artifact.add_file(local_path=\"zoo_dataset.pt\", name=\"zoo_dataset\")\n", + " artifact.add_file(local_path=\"zoo_labels.pt\", name=\"zoo_labels\")\n", "\n", - "run.finish()" + " run.link_artifact(artifact=artifact, target_path=dataset_target_path)" ] }, { @@ -570,34 +346,32 @@ "metadata": {}, "outputs": [], "source": [ - "run = wandb.init(\n", + "with wandb.init(\n", " entity=TEAM_ENTITY,\n", " project=PROJECT,\n", " job_type=\"publish_split_dataset\", \n", " config=config\n", - ")\n", - "\n", - "# Let's add a description to let others know which file to use in future experiments\n", - "artifact = wandb.Artifact(\n", - " name=\"split_zoo_dataset\",\n", - " type=\"dataset\", \n", - " description=\"Artifact contains `zoo_dataset` split into 4 datasets. \\\n", - " For training, use `zoo_dataset_X_train` and `zoo_labels_y_train`. \\\n", - " For testing, use `zoo_dataset_X_test` and `zoo_labels_y_test`.\"\n", - ")\n", + ") as run:\n", "\n", - "artifact.add_file(local_path=\"zoo_dataset_X_train.pt\", name=\"zoo_dataset_X_train\")\n", - "artifact.add_file(local_path=\"zoo_labels_y_train.pt\", name=\"zoo_labels_y_train\")\n", - "artifact.add_file(local_path=\"zoo_dataset_X_test.pt\", name=\"zoo_dataset_X_test\")\n", - "artifact.add_file(local_path=\"zoo_labels_y_test.pt\", name=\"zoo_labels_y_test\")\n", + " # Let's add a description to let others know which file to use in future experiments\n", + " artifact = wandb.Artifact(\n", + " name=\"split_zoo_dataset\",\n", + " type=\"dataset\", \n", + " description=\"Artifact contains `zoo_dataset` split into 4 datasets. \\\n", + " For training, use `zoo_dataset_X_train` and `zoo_labels_y_train`. \\\n", + " For testing, use `zoo_dataset_X_test` and `zoo_labels_y_test`.\"\n", + " )\n", "\n", - "# Create a target path for our artifact in the registry\n", - "COLLECTION_NAME = \"zoo-dataset-tensors-split\"\n", - "target_dataset_path=f\"wandb-registry-{REGISTRY_NAME}/{COLLECTION_NAME}\"\n", + " artifact.add_file(local_path=\"zoo_dataset_X_train.pt\", name=\"zoo_dataset_X_train\")\n", + " artifact.add_file(local_path=\"zoo_labels_y_train.pt\", name=\"zoo_labels_y_train\")\n", + " artifact.add_file(local_path=\"zoo_dataset_X_test.pt\", name=\"zoo_dataset_X_test\")\n", + " artifact.add_file(local_path=\"zoo_labels_y_test.pt\", name=\"zoo_labels_y_test\")\n", "\n", - "run.link_artifact(artifact=artifact, target_path=target_dataset_path)\n", + " # Create a target path for our artifact in the registry\n", + " COLLECTION_NAME = \"zoo-dataset-tensors-split\"\n", + " target_dataset_path=f\"wandb-registry-{REGISTRY_NAME}/{COLLECTION_NAME}\"\n", "\n", - "run.finish()" + " run.link_artifact(artifact=artifact, target_path=target_dataset_path)" ] }, { @@ -878,10 +652,9 @@ "metadata": {}, "outputs": [], "source": [ - "run = wandb.init(entity=TEAM_ENTITY, project=PROJECT)\n", - "model_artifact = run.use_artifact(artifact_or_name=artifact_name, type=\"model\")\n", - "run.link_artifact(artifact=model_artifact, target_path=target_path)\n", - "run.finish()" + "with wandb.init(entity=TEAM_ENTITY, project=PROJECT) as run:\n", + " model_artifact = run.use_artifact(artifact_or_name=artifact_name, type=\"model\")\n", + " run.link_artifact(artifact=model_artifact, target_path=target_path)" ] }, { @@ -990,9 +763,9 @@ "DIFFERENT_TEAM_ENTITY = \"\"\n", "DIFFERENT_PROJECT = \"Check_Zoo_Model\"\n", "\n", - "run = wandb.init(entity=DIFFERENT_TEAM_ENTITY, project=DIFFERENT_PROJECT)\n", - "registry_model = run.use_artifact(artifact_or_name=model_artifact_name)\n", - "local_model_path = registry_model.download()" + "with wandb.init(entity=DIFFERENT_TEAM_ENTITY, project=DIFFERENT_PROJECT) as run:\n", + " registry_model = run.use_artifact(artifact_or_name=model_artifact_name)\n", + " local_model_path = registry_model.download()" ] }, { @@ -1059,27 +832,17 @@ "metadata": {}, "outputs": [], "source": [ - "run = wandb.init(entity=DIFFERENT_TEAM_ENTITY, project=DIFFERENT_PROJECT)\n", - "dataset_artifact = run.use_artifact(artifact_or_name=data_artifact_name, type=\"dataset\")\n", - "local_dataset_path = dataset_artifact.download()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "77089d70-5248-4946-be78-ce4a87bb58ad", - "metadata": {}, - "outputs": [], - "source": [ - "# Test data and label filenames\n", - "test_data_filename = \"zoo_dataset_X_test\"\n", - "test_labels_filename = \"zoo_labels_y_test\" \n", + "with wandb.init(entity=DIFFERENT_TEAM_ENTITY, project=DIFFERENT_PROJECT) as run:\n", + " dataset_artifact = run.use_artifact(artifact_or_name=data_artifact_name, type=\"dataset\")\n", + " local_dataset_path = dataset_artifact.download()\n", "\n", - "# Load dataset and labels into notebook\n", - "loaded_data = torch.load(f\"{local_dataset_path}/{test_data_filename}\")\n", - "loaded_labels = torch.load(f\"{local_dataset_path}/{test_labels_filename}\")\n", + " # Test data and label filenames\n", + " test_data_filename = \"zoo_dataset_X_test\"\n", + " test_labels_filename = \"zoo_labels_y_test\" \n", "\n", - "run.finish()" + " # Load dataset and labels into notebook\n", + " loaded_data = torch.load(f\"{local_dataset_path}/{test_data_filename}\")\n", + " loaded_labels = torch.load(f\"{local_dataset_path}/{test_labels_filename}\")" ] }, { @@ -1252,30 +1015,7 @@ ] } ], - "metadata": { - "accelerator": "GPU", - "colab": { - "include_colab_link": true, - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.0" - } - }, + "metadata": {}, "nbformat": 4, "nbformat_minor": 5 }