diff --git a/02_activities/assignments/assignment1.ipynb b/02_activities/assignments/assignment1.ipynb new file mode 100644 index 000000000..a75d3a268 --- /dev/null +++ b/02_activities/assignments/assignment1.ipynb @@ -0,0 +1,678 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "3f73d818", + "metadata": {}, + "outputs": [], + "source": [ + "# Import standard libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import random\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.colors as mcolors\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.metrics import recall_score, precision_score\n", + "from sklearn.model_selection import cross_validate\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.metrics import accuracy_score" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bb80c709", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... 
\n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.datasets import load_wine\n", + "\n", + "# Load the Wine dataset through scikit-learn's load_wine()\n", + "wine_data = load_wine()\n", + "\n", + "# Feature matrix -> DataFrame, then append the target as the 'class' column\n", + "wine_df = pd.DataFrame(\n", + "    data=wine_data.data,\n", + "    columns=wine_data.feature_names,\n", + ")\n", + "wine_df['class'] = wine_data.target\n", + "\n", + "wine_df" + ] + }, + { + "cell_type": "markdown", + "id": "d18dac17", + "metadata": {}, + "source": [ + "Question 1:\n", + "Data inspection\n", + "Before fitting any model, it is essential to understand our data. Use Python code to answer the following questions about the Wine dataset:\n", + "\n", + "(i) How many observations (rows) does the dataset contain?\n", + "(A): 178" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a71846dc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "178" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wine_df.shape[0]\n", + "#178 rows" + ] + }, + { + "cell_type": "markdown", + "id": "465f3716", + "metadata": {}, + "source": [ + "(ii) How many variables (columns) does the dataset contain?\n", + "(A): 14" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0e9bab6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "14" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wine_df.shape[1]\n", + "#14 columns\n" + ] + }, + { + "cell_type": "markdown", + "id": "b01dfe2d", + "metadata": {}, + "source": [ + "(iii) What is the 'variable type' of the response variable class (e.g., 'integer', 'category', etc.)?
What are the 'levels' (unique values) of the variable?\n", + "(A): Class is a categorical variable whose levels are encoded as integers; it is stored in our DataFrame with dtype int64. Its levels (unique values) are 0, 1 and 2." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "221f70d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1, 2])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(wine_df['class'].dtype)  # display() is needed: a cell only echoes its last expression\n", + "#int64\n", + "wine_df['class'].unique()\n", + "#0, 1, 2" + ] + }, + { + "cell_type": "markdown", + "id": "c86c7f27", + "metadata": {}, + "source": [ + "(iv) How many predictor variables do we have (Hint: all variables other than class)?\n", + "(A): 13" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edf468d3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "13" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(wine_df.columns) - 1  # every column except 'class'\n", + "#13" + ] + }, + { + "cell_type": "markdown", + "id": "4fe15c60", + "metadata": {}, + "source": [ + "Question 2:\n", + "Standardization and data-splitting\n", + "Next, we must perform 'pre-processing' or 'data munging', to prepare our data for classification/prediction. For KNN, there are three essential steps. A first essential step is to 'standardize' the predictor variables.
We can achieve this using the scaler method, provided as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2c3d3d73", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], + "source": [ + "# Predictors: every column except the last one ('class')\n", + "predictors = wine_df.iloc[:, :-1]\n", + "# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split\n", + "scaler = StandardScaler()\n", + "predictors_standardized = pd.DataFrame(\n", + "    scaler.fit_transform(predictors),\n", + "    columns=predictors.columns,\n", + ")\n", + "print(predictors_standardized.head())  # first rows of the standardized predictors" + ] + }, + { + "cell_type": "markdown", + "id": "86b67b07", + "metadata": {}, + "source": [ + "(i) Why is it important to standardize the predictor variables?\n", + "(A): Knn relies on comparing distances between data points to fit a model to the data.
Variables with large scales sway the distance computation more, influencing the estimation incorrectly.\n", + "\n", + "(ii) Why did we elect not to standardize our response variable Class?\n", + "(A): Class is a categorical variable and is the quantity of interest; if we standardized it, we would have to rescale our predictions for them to be interpretable.\n", + "\n", + "(iii) A second essential step is to set a random seed. Do so below (Hint: use the random.seed function). Why is setting a seed important? Is the particular seed value important? Why or why not?\n", + "\n", + "(A) A random seed is required to ensure the results are reproducible: it locks in the random sequence used for testing. The particular seed value is not important; it only serves as a key to access the same random sequence on every run.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "93bed139", + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "random.seed(1)\n", + "np.random.seed(1)\n" + ] + }, + { + "cell_type": "markdown", + "id": "b8a9a349", + "metadata": {}, + "source": [ + "(iv) A third essential step is to split our standardized data into separate training and testing sets. We will split into 75% training and 25% testing. The provided code randomly partitions our data, and creates linked training sets for the predictors and response variables.\n", + "\n", + "Extend the code to create a non-overlapping test set for the predictors and response variables.\n", + "# set a seed for reproducibility\n", + "np.random.seed(1)\n", + "\n", + "# split the data into a training and testing set. hint: use train_test_split !\n", + "\n", + "# Your code here ..."
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "203c93ee", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "np.random.seed(1)  # global NumPy seed; the split is also pinned by random_state below\n", + "X, y = predictors_standardized, wine_df['class']\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + "    X, y, test_size=0.25, random_state=1)" + ] + }, + { + "cell_type": "markdown", + "id": "405f5aef", + "metadata": {}, + "source": [ + "Question 3:\n", + "Model initialization and cross-validation\n", + "We are finally set to fit the KNN model.\n", + "\n", + "Perform a grid search to tune the n_neighbors hyperparameter using 10-fold cross-validation. Follow these steps:\n", + "\n", + "1. Initialize the KNN classifier using KNeighborsClassifier().\n", + "2. Define a parameter grid for n_neighbors ranging from 1 to 50.\n", + "3. Implement a grid search using GridSearchCV with 10-fold cross-validation to find the optimal number of neighbors.\n", + "4. After fitting the model on the training data, identify and return the best value for n_neighbors based on the grid search results."
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6204d2d1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The best value of k is: 13\n" + ] + } + ], + "source": [ + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "#1 Initialize the KNN classifier\n", + "knn = KNeighborsClassifier()\n", + "\n", + "#2 Define the parameter grid for 'n_neighbors': 1 to 50 inclusive (range() stops before 51)\n", + "param_grid = {'n_neighbors': list(range(1, 51))}\n", + "\n", + "#3 Set up GridSearchCV with 10-fold cross-validation\n", + "grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10)\n", + "\n", + "#4 Fit the GridSearchCV to the training data\n", + "grid_search.fit(X_train, y_train)\n", + "\n", + "#5 Get the best 'n_neighbors' value\n", + "best_k = grid_search.best_params_['n_neighbors']\n", + "print(f\"The best value of k is: {best_k}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "ba451349", + "metadata": {}, + "source": [ + "Question 4:\n", + "Model evaluation\n", + "Using the best value for n_neighbors, fit a KNN model on the training data and evaluate its performance on the test set using accuracy_score."
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3127c4c1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test Accuracy: 0.9777777777777777\n" + ] + } + ], + "source": [ + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "# Refit KNN with the tuned k on the training set, then score it on the held-out test set\n", + "knn_best = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)\n", + "y_pred = knn_best.predict(X_test)\n", + "accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)\n", + "print(\"Test Accuracy:\", accuracy)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "lcr-env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}