diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 1d25bbcb3..d73df0f07 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -34,10 +34,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Matplotlib is building the font cache; this may take a moment.\n" + ] + } + ], "source": [ "# Import standard libraries\n", "import pandas as pd\n", @@ -56,10 +64,288 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... \n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.datasets import load_wine\n", "\n", @@ -94,9 +380,18 @@ "execution_count": null, "id": "56916892", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "178\n" + ] + } + ], "source": [ - "# Your answer here" + "print(wine_df.shape[0])\n", + "# 178" ] }, { @@ -112,9 +407,18 @@ "execution_count": null, "id": "df0ef103", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "14\n" + ] + } + ], "source": [ - "# Your answer here" + "print(wine_df.shape[1])\n", + "# 14" ] }, { @@ -130,9 +434,20 @@ "execution_count": null, "id": "47989426", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "int64\n", + "[0 1 2]\n" + ] + } + ], "source": [ - "# Your answer here" + "print(wine_df['class'].dtype)\n", + "print(wine_df['class'].unique())\n", + "# integer, 3 levels [0 1 2]" ] }, { @@ -146,12 +461,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "bd7b0910", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "13\n" + ] + } + ], "source": [ - "# Your answer here" + "print(wine_df.shape[1] - 1)\n", + "# 13" ] }, { @@ -175,10 +499,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "cc899b59", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], "source": [ "# Select predictors (excluding the last column)\n", "predictors = wine_df.iloc[:, :-1]\n", @@ -204,7 +555,7 @@ "id": "403ef0bb", "metadata": {}, "source": [ - "> Your answer here..." + "> I think standardization might be helpful since KNNs use euclidean distance to find the neighbours. But by that same toke, that means features with alrger scaled would dominate over features with smaller scales, which makes those distances kind of pointless. " ] }, { @@ -220,7 +571,7 @@ "id": "fdee5a15", "metadata": {}, "source": [ - "> Your answer here..." + "> The variable is categorical " ] }, { @@ -236,7 +587,7 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." + "> Yes, its important because it allows reproducability. This means you can get the same results every time you run the code. Strictly speaking, the value doesn't matter as much, as long as its consistent between everyone. " ] }, { @@ -251,7 +602,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "72c101f2", "metadata": {}, "outputs": [], @@ -261,7 +612,12 @@ "\n", "# split the data into a training and testing set. hint: use train_test_split !\n", "\n", - "# Your code here ..." + "X_train, X_test, y_train, y_test = train_test_split(\n", + " predictors_standardized,\n", + " wine_df['class'],\n", + " test_size=0.25,\n", + " random_state=123\n", + ")\n" ] }, { @@ -284,12 +640,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "08818c64", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'n_neighbors': 15}\n" + ] + } + ], "source": [ - "# Your code here..." + "knn = KNeighborsClassifier()\n", + "\n", + "param_grid = {'n_neighbors': range(1, 51)}\n", + "\n", + "grid_search = GridSearchCV(knn, param_grid, cv=10)\n", + "grid_search.fit(X_train, y_train)\n", + "\n", + "print(grid_search.best_params_)\n" ] }, { @@ -305,12 +676,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "ffefa9f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9333333333333333\n" + ] + } + ], "source": [ - "# Your code here..." + "best_knn = KNeighborsClassifier(n_neighbors=grid_search.best_params_['n_neighbors'])\n", + "best_knn.fit(X_train, y_train)\n", + "\n", + "y_pred = best_knn.predict(X_test)\n", + "print(accuracy_score(y_test, y_pred))" ] }, { @@ -365,7 +748,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.4", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -379,12 +762,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" - }, - "vscode": { - "interpreter": { - "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e" - } + "version": "3.11.6" } }, "nbformat": 4,