diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 1d25bbcb3..bfd8c122b 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, "outputs": [], @@ -56,10 +56,288 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... 
\n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.datasets import load_wine\n", "\n", @@ -96,7 +374,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Your answer here" + "# 178 rows" ] }, { @@ -114,7 +392,9 @@ "metadata": {}, "outputs": [], "source": [ - "# Your answer here" + "# Your answer here\n", + "\n", + "# 14 columns" ] }, { @@ -127,12 +407,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 58, "id": "47989426", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 1 2]\n" + ] + } + ], "source": [ - "# Your answer here" + "# I'm just double checking how many values there are, since I can't display all of them\n", + "uniques = wine_df['class'].unique()\n", + "print(uniques)\n", + "\n", + "# 'class' is an integer variable, and the levels are 0, 1, or 2" ] }, { @@ -151,7 +443,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Your answer here" + "# 13 predictors" ] }, { @@ -175,10 +467,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "cc899b59", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 
-0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], "source": [ "# Select predictors (excluding the last column)\n", "predictors = wine_df.iloc[:, :-1]\n", @@ -204,7 +523,7 @@ "id": "403ef0bb", "metadata": {}, "source": [ - "> Your answer here..." + "Standardizing the predictor variables makes them more comparable, and makes sure that variables with small absolute values are not under-represented in the later models or statistics." ] }, { @@ -220,7 +539,9 @@ "id": "fdee5a15", "metadata": {}, "source": [ - "> Your answer here..." + "'class' is a categorical label stored as an integer (0, 1, or 2), so standardizing it would be meaningless.\n", + "\n", + "More generally, standardizing continuous response variables can make it difficult to interpret results, since the standardization removes the magnitude of real changes." ] }, { @@ -236,7 +557,7 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." + "It's important to set a seed to ensure that later analyses are reproducible, either for yourself later on, or for future researchers working with the same data. The specific value of the seed is not very important, since it just gives the computer a 'starting point' for random number generation." ] }, { @@ -260,8 +581,14 @@ "np.random.seed(123)\n", "\n", "# split the data into a training and testing set. hint: use train_test_split !\n", + "x_train, x_test, y_train, y_test = train_test_split(\n", + " predictors_standardized,\n", + " wine_df['class'],\n", + " test_size = 0.25,\n", + ")\n", "\n", - "# Your code here ..."
+ "# I wrote the code by modifying what we use in our live-learning sessions with the code found here\n", + "# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html" ] }, { @@ -287,9 +614,46 @@ "execution_count": null, "id": "08818c64", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "15\n" + ] + } + ], "source": [ - "# Your code here..." + "# 1. initialize KNN, this just turns the function into an object to call into step 3\n", + "knn = KNeighborsClassifier()\n", + "\n", + "# 2. Define a parameter grid for n_neighbors ranging 1 - 50\n", + "# This creates a dict called param_grid whose 'n_neighbors' entry holds the numbers 1-50\n", + "param_grid = {\n", + " 'n_neighbors': list(range(1, 51))\n", + "}\n", + "\n", + "# 3. Implement grid search using GridSearchCV with 10-fold CV\n", + "grid_search = GridSearchCV(\n", + " # Call the knn object created earlier as the estimator\n", + " estimator=knn,\n", + " # Call the list of 1-50 created earlier\n", + " param_grid=param_grid,\n", + " # establish the cross-validation as 10-fold\n", + " cv=10,\n", + " # searching for highest accuracy, versus 'precision', 'recall', etc.\n", + " scoring='accuracy'\n", + ")\n", + "\n", + "# 4. 
Fit the model to the training data and return best value for n_neighbours\n", + "grid_search.fit(x_train, y_train)\n", + "# Extract out the optimal value for 'n_neighbors' as defined in Step # 3 above\n", + "best_k = grid_search.best_params_['n_neighbors']\n", + "# Print the value\n", + "print(best_k)\n", + "\n", + "# I wrote this code by modifying the code found here:\n", + "# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html" ] }, { @@ -305,12 +669,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 68, "id": "ffefa9f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9333333333333333\n" + ] + } + ], "source": [ - "# Your code here..." + "# First, initialize the KNN like last question, but using the best_k\n", + "knn_best = KNeighborsClassifier(n_neighbors=best_k)\n", + "\n", + "# Now fit the knn_best onto the x and y training data\n", + "knn_best.fit(x_train, y_train)\n", + "\n", + "# Take that fitted knn and predict it onto the test data --> only predictor variables (x variables)\n", + "# the response variable (y variable) is not used here because the next step is to evaluate how accurate the prediction is\n", + "y_pred = knn_best.predict(x_test)\n", + "\n", + "# Now use accuracy_score to measure how often the model predictions are correct\n", + "# This divides the number of correct predictions by the number of total predictions, giving a proportion\n", + "test_accuracy = accuracy_score(y_test, y_pred)\n", + "\n", + "# Print the result of test_accuracy\n", + "print(test_accuracy)" ] }, { @@ -365,7 +752,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.4", + "display_name": "lcr-env", "language": "python", "name": "python3" }, @@ -379,12 +766,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" - }, - "vscode": { - "interpreter": { - "hash": 
"497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e" - } + "version": "3.11.14" } }, "nbformat": 4,