diff --git a/02_activities/assignments/assignment1.ipynb b/02_activities/assignments/assignment1.ipynb
new file mode 100644
index 000000000..a75d3a268
--- /dev/null
+++ b/02_activities/assignments/assignment1.ipynb
@@ -0,0 +1,678 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "3f73d818",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import standard libraries\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import random\n",
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib.colors as mcolors\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.neighbors import KNeighborsClassifier\n",
+ "from sklearn.metrics import recall_score, precision_score\n",
+ "from sklearn.model_selection import cross_validate\n",
+ "from sklearn.model_selection import GridSearchCV\n",
+ "from sklearn.metrics import accuracy_score"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "bb80c709",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " alcohol | \n",
+ " malic_acid | \n",
+ " ash | \n",
+ " alcalinity_of_ash | \n",
+ " magnesium | \n",
+ " total_phenols | \n",
+ " flavanoids | \n",
+ " nonflavanoid_phenols | \n",
+ " proanthocyanins | \n",
+ " color_intensity | \n",
+ " hue | \n",
+ " od280/od315_of_diluted_wines | \n",
+ " proline | \n",
+ " class | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 14.23 | \n",
+ " 1.71 | \n",
+ " 2.43 | \n",
+ " 15.6 | \n",
+ " 127.0 | \n",
+ " 2.80 | \n",
+ " 3.06 | \n",
+ " 0.28 | \n",
+ " 2.29 | \n",
+ " 5.64 | \n",
+ " 1.04 | \n",
+ " 3.92 | \n",
+ " 1065.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 13.20 | \n",
+ " 1.78 | \n",
+ " 2.14 | \n",
+ " 11.2 | \n",
+ " 100.0 | \n",
+ " 2.65 | \n",
+ " 2.76 | \n",
+ " 0.26 | \n",
+ " 1.28 | \n",
+ " 4.38 | \n",
+ " 1.05 | \n",
+ " 3.40 | \n",
+ " 1050.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 13.16 | \n",
+ " 2.36 | \n",
+ " 2.67 | \n",
+ " 18.6 | \n",
+ " 101.0 | \n",
+ " 2.80 | \n",
+ " 3.24 | \n",
+ " 0.30 | \n",
+ " 2.81 | \n",
+ " 5.68 | \n",
+ " 1.03 | \n",
+ " 3.17 | \n",
+ " 1185.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 14.37 | \n",
+ " 1.95 | \n",
+ " 2.50 | \n",
+ " 16.8 | \n",
+ " 113.0 | \n",
+ " 3.85 | \n",
+ " 3.49 | \n",
+ " 0.24 | \n",
+ " 2.18 | \n",
+ " 7.80 | \n",
+ " 0.86 | \n",
+ " 3.45 | \n",
+ " 1480.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 13.24 | \n",
+ " 2.59 | \n",
+ " 2.87 | \n",
+ " 21.0 | \n",
+ " 118.0 | \n",
+ " 2.80 | \n",
+ " 2.69 | \n",
+ " 0.39 | \n",
+ " 1.82 | \n",
+ " 4.32 | \n",
+ " 1.04 | \n",
+ " 2.93 | \n",
+ " 735.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 173 | \n",
+ " 13.71 | \n",
+ " 5.65 | \n",
+ " 2.45 | \n",
+ " 20.5 | \n",
+ " 95.0 | \n",
+ " 1.68 | \n",
+ " 0.61 | \n",
+ " 0.52 | \n",
+ " 1.06 | \n",
+ " 7.70 | \n",
+ " 0.64 | \n",
+ " 1.74 | \n",
+ " 740.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 174 | \n",
+ " 13.40 | \n",
+ " 3.91 | \n",
+ " 2.48 | \n",
+ " 23.0 | \n",
+ " 102.0 | \n",
+ " 1.80 | \n",
+ " 0.75 | \n",
+ " 0.43 | \n",
+ " 1.41 | \n",
+ " 7.30 | \n",
+ " 0.70 | \n",
+ " 1.56 | \n",
+ " 750.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 175 | \n",
+ " 13.27 | \n",
+ " 4.28 | \n",
+ " 2.26 | \n",
+ " 20.0 | \n",
+ " 120.0 | \n",
+ " 1.59 | \n",
+ " 0.69 | \n",
+ " 0.43 | \n",
+ " 1.35 | \n",
+ " 10.20 | \n",
+ " 0.59 | \n",
+ " 1.56 | \n",
+ " 835.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 176 | \n",
+ " 13.17 | \n",
+ " 2.59 | \n",
+ " 2.37 | \n",
+ " 20.0 | \n",
+ " 120.0 | \n",
+ " 1.65 | \n",
+ " 0.68 | \n",
+ " 0.53 | \n",
+ " 1.46 | \n",
+ " 9.30 | \n",
+ " 0.60 | \n",
+ " 1.62 | \n",
+ " 840.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 177 | \n",
+ " 14.13 | \n",
+ " 4.10 | \n",
+ " 2.74 | \n",
+ " 24.5 | \n",
+ " 96.0 | \n",
+ " 2.05 | \n",
+ " 0.76 | \n",
+ " 0.56 | \n",
+ " 1.35 | \n",
+ " 9.20 | \n",
+ " 0.61 | \n",
+ " 1.60 | \n",
+ " 560.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
178 rows × 14 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
+ "0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
+ "1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
+ "2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
+ "3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
+ "4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
+ ".. ... ... ... ... ... ... \n",
+ "173 13.71 5.65 2.45 20.5 95.0 1.68 \n",
+ "174 13.40 3.91 2.48 23.0 102.0 1.80 \n",
+ "175 13.27 4.28 2.26 20.0 120.0 1.59 \n",
+ "176 13.17 2.59 2.37 20.0 120.0 1.65 \n",
+ "177 14.13 4.10 2.74 24.5 96.0 2.05 \n",
+ "\n",
+ " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
+ "0 3.06 0.28 2.29 5.64 1.04 \n",
+ "1 2.76 0.26 1.28 4.38 1.05 \n",
+ "2 3.24 0.30 2.81 5.68 1.03 \n",
+ "3 3.49 0.24 2.18 7.80 0.86 \n",
+ "4 2.69 0.39 1.82 4.32 1.04 \n",
+ ".. ... ... ... ... ... \n",
+ "173 0.61 0.52 1.06 7.70 0.64 \n",
+ "174 0.75 0.43 1.41 7.30 0.70 \n",
+ "175 0.69 0.43 1.35 10.20 0.59 \n",
+ "176 0.68 0.53 1.46 9.30 0.60 \n",
+ "177 0.76 0.56 1.35 9.20 0.61 \n",
+ "\n",
+ " od280/od315_of_diluted_wines proline class \n",
+ "0 3.92 1065.0 0 \n",
+ "1 3.40 1050.0 0 \n",
+ "2 3.17 1185.0 0 \n",
+ "3 3.45 1480.0 0 \n",
+ "4 2.93 735.0 0 \n",
+ ".. ... ... ... \n",
+ "173 1.74 740.0 2 \n",
+ "174 1.56 750.0 2 \n",
+ "175 1.56 835.0 2 \n",
+ "176 1.62 840.0 2 \n",
+ "177 1.60 560.0 2 \n",
+ "\n",
+ "[178 rows x 14 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.datasets import load_wine\n",
+ "\n",
+ "# Fetch the Wine dataset that ships with scikit-learn\n",
+ "wine_data = load_wine()\n",
+ "\n",
+ "# Put the feature matrix in a DataFrame (one named column per feature) and\n",
+ "# append the target labels as a final 'class' column\n",
+ "wine_features = pd.DataFrame(data=wine_data.data, columns=wine_data.feature_names)\n",
+ "wine_df = wine_features.assign(**{'class': wine_data.target})\n",
+ "\n",
+ "# Render the DataFrame as the cell output\n",
+ "wine_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d18dac17",
+ "metadata": {},
+ "source": [
+ "Question 1:\n",
+ "Data inspection\n",
+ "Before fitting any model, it is essential to understand our data. Use Python code to answer the following questions about the Wine dataset:\n",
+ "\n",
+ "(i) How many observations (rows) does the dataset contain?\n",
+ "(A): 178"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a71846dc",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "178"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Number of observations = number of rows in the DataFrame (178)\n",
+ "wine_df.shape[0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "465f3716",
+ "metadata": {},
+ "source": [
+ "(ii) How many variables (columns) does the dataset contain?\n",
+ "(A): 14"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b0e9bab6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "14"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Number of variables = number of columns in the DataFrame (14)\n",
+ "wine_df.shape[1]\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b01dfe2d",
+ "metadata": {},
+ "source": [
+ "(iii) What is the 'variable type' of the response variable class (e.g., 'integer', 'category', etc.)? What are the 'levels' (unique values) of the variable?\n",
+ "(A): Class is a categorical variable, represented here as integers and stored in our DataFrame as int64. Its levels (unique values) are 0, 1, and 2."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "221f70d8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([0, 1, 2])"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Storage type of the response variable (int64); printed explicitly because a\n",
+ "# notebook cell only displays the value of its last expression\n",
+ "print(wine_df['class'].dtype)\n",
+ "\n",
+ "# Levels (unique values) of the response variable: 0, 1, 2\n",
+ "wine_df['class'].unique()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c86c7f27",
+ "metadata": {},
+ "source": [
+ "(iv) How many predictor variables do we have (Hint: all variables other than class)?\n",
+ "(A): 13"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "edf468d3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "13"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Predictors are all columns other than the response 'class' (13)\n",
+ "wine_df.shape[1] - 1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4fe15c60",
+ "metadata": {},
+ "source": [
+ "Question 2:\n",
+ "Standardization and data-splitting\n",
+ "Next, we must perform 'pre-processing' or 'data munging', to prepare our data for classification/prediction. For KNN, there are three essential steps. A first essential step is to 'standardize' the predictor variables. We can achieve this using the scaler method, provided as follows:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "2c3d3d73",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n",
+ "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n",
+ "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n",
+ "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n",
+ "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n",
+ "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n",
+ "\n",
+ " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n",
+ "0 0.808997 1.034819 -0.659563 1.224884 \n",
+ "1 0.568648 0.733629 -0.820719 -0.544721 \n",
+ "2 0.808997 1.215533 -0.498407 2.135968 \n",
+ "3 2.491446 1.466525 -0.981875 1.032155 \n",
+ "4 0.808997 0.663351 0.226796 0.401404 \n",
+ "\n",
+ " color_intensity hue od280/od315_of_diluted_wines proline \n",
+ "0 0.251717 0.362177 1.847920 1.013009 \n",
+ "1 -0.293321 0.406051 1.113449 0.965242 \n",
+ "2 0.269020 0.318304 0.788587 1.395148 \n",
+ "3 1.186068 -0.427544 1.184071 2.334574 \n",
+ "4 -0.319276 0.362177 0.449601 -0.037874 \n"
+ ]
+ }
+ ],
+ "source": [
+ "# Predictors are every column except the last one, which holds 'class'\n",
+ "predictors = wine_df.iloc[:, :-1]\n",
+ "\n",
+ "# Fit the scaler on the predictors and transform them in one step, then\n",
+ "# rebuild a DataFrame so the original column names are kept\n",
+ "# NOTE(review): the scaler is fit on all rows, i.e. before any train/test split\n",
+ "scaler = StandardScaler()\n",
+ "standardized_values = scaler.fit_transform(predictors)\n",
+ "predictors_standardized = pd.DataFrame(standardized_values, columns=predictors.columns)\n",
+ "\n",
+ "# Print the first rows of the standardized predictors\n",
+ "print(predictors_standardized.head())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "86b67b07",
+ "metadata": {},
+ "source": [
+ "(i) Why is it important to standardize the predictor variables?\n",
+ "(A): KNN classifies a point by comparing its distance to other data points. Variables measured on larger scales dominate the distance computation, so without standardization they would unduly influence the predictions.\n",
+ "\n",
+ "(ii) Why did we elect not to standardize our response variable Class?\n",
+ "(A): Class is a categorical variable and is the quantity of interest; if we standardized it, we would have to rescale our predictions for them to be interpretable.\n",
+ "\n",
+ "(iii) A second essential step is to set a random seed. Do so below (Hint: use the random.seed function). Why is setting a seed important? Is the particular seed value important? Why or why not?\n",
+ "\n",
+ "(A) A random seed is required to ensure the results are reproducible, locking in the random sequence used for testing. The particular seed value is not important; it simply serves as a key to access the same random sequence each time.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "93bed139",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import random\n",
+ "# Seed NumPy's global RNG and Python's built-in RNG with the same fixed value\n",
+ "# so that anything drawing from either generator repeats on every run\n",
+ "np.random.seed(1)\n",
+ "random.seed(1)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b8a9a349",
+ "metadata": {},
+ "source": [
+ "(iv) A third essential step is to split our standardized data into separate training and testing sets. We will split into 75% training and 25% testing. The provided code randomly partitions our data, and creates linked training sets for the predictors and response variables.\n",
+ "\n",
+ "Extend the code to create a non-overlapping test set for the predictors and response variables.\n",
+ "\n",
+ "```python\n",
+ "# set a seed for reproducibility\n",
+ "np.random.seed(1)\n",
+ "\n",
+ "# split the data into a training and testing set. hint: use train_test_split !\n",
+ "\n",
+ "# Your code here ...\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "203c93ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "# Re-seed NumPy's global RNG, as the exercise template does\n",
+ "np.random.seed(1)\n",
+ "\n",
+ "# Features are the standardized predictors; the target is the 'class' column\n",
+ "X, y = predictors_standardized, wine_df['class']\n",
+ "\n",
+ "# Hold out 25% of the rows for testing; random_state pins the shuffle\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ "    X,\n",
+ "    y,\n",
+ "    test_size=0.25,\n",
+ "    random_state=1,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "405f5aef",
+ "metadata": {},
+ "source": [
+ "Question 3:\n",
+ "Model initialization and cross-validation\n",
+ "We are finally set to fit the KNN model.\n",
+ "\n",
+ "Perform a grid search to tune the n_neighbors hyperparameter using 10-fold cross-validation. Follow these steps:\n",
+ "\n",
+ "1. Initialize the KNN classifier using KNeighborsClassifier().\n",
+ "2. Define a parameter grid for n_neighbors ranging from 1 to 50.\n",
+ "3. Implement a grid search using GridSearchCV with 10-fold cross-validation to find the optimal number of neighbors.\n",
+ "4. After fitting the model on the training data, identify and return the best value for n_neighbors based on the grid search results."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "6204d2d1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The best value of k is: 13\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.neighbors import KNeighborsClassifier\n",
+ "from sklearn.model_selection import GridSearchCV\n",
+ "\n",
+ "# 1. Initialize the KNN classifier\n",
+ "knn = KNeighborsClassifier()\n",
+ "\n",
+ "# 2. Define the parameter grid for 'n_neighbors': 1 to 50 inclusive\n",
+ "#    (range() excludes its stop value, so the stop must be 51)\n",
+ "param_grid = {'n_neighbors': list(range(1, 51))}\n",
+ "\n",
+ "# 3. Set up GridSearchCV with 10-fold cross-validation\n",
+ "grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10)\n",
+ "\n",
+ "# 4. Fit the grid search on the training data only, so the test set stays unseen\n",
+ "grid_search.fit(X_train, y_train)\n",
+ "\n",
+ "# 5. Read off the 'n_neighbors' value selected by the search\n",
+ "best_k = grid_search.best_params_['n_neighbors']\n",
+ "print(f\"The best value of k is: {best_k}\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ba451349",
+ "metadata": {},
+ "source": [
+ "Question 4:\n",
+ "Model evaluation\n",
+ "Using the best value for n_neighbors, fit a KNN model on the training data and evaluate its performance on the test set using accuracy_score."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "3127c4c1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Test Accuracy: 0.9777777777777777\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.neighbors import KNeighborsClassifier\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "\n",
+ "# Refit KNN on the training split using the k chosen by the grid search\n",
+ "knn_best = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)\n",
+ "\n",
+ "# Predict the held-out test rows and compare against their true classes\n",
+ "y_pred = knn_best.predict(X_test)\n",
+ "accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)\n",
+ "print(\"Test Accuracy:\", accuracy)\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "lcr-env",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.15"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}