ph_ny_mat_sci/dz_les_6.ipynb
2022-11-30 12:31:57 +03:00

3983 lines
135 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "e4f5991e",
"metadata": {},
"source": [
"# Тема “Обучение с учителем”"
]
},
{
"cell_type": "markdown",
"id": "2442aab9",
"metadata": {},
"source": [
"## Задание 1\n",
"Импортируйте библиотеки pandas и numpy.\n",
"\n",
"Загрузите \"Boston House Prices dataset\" из встроенных наборов данных библиотеки sklearn. Создайте датафреймы X и y из этих данных.\n",
"\n",
"Разбейте эти датафреймы на тренировочные (X_train, y_train) и тестовые (X_test, y_test) с помощью функции train_test_split так, чтобы размер тестовой выборки составлял 30% от всех данных, при этом аргумент random_state должен быть равен 42.\n",
"\n",
"Создайте модель линейной регрессии под названием lr с помощью класса LinearRegression из модуля sklearn.linear_model.\n",
"\n",
"Обучите модель на тренировочных данных (используйте все признаки) и сделайте предсказание на тестовых.\n",
"\n",
"Вычислите R2 полученных предказаний с помощью r2_score из модуля sklearn.metrics."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "f79ac751",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "483c687f",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import load_boston"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "fd8d693f",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import warnings\n",
"from sklearn.datasets import load_boston\n",
"with warnings.catch_warnings():\n",
" # You should probably not use this dataset.\n",
" warnings.filterwarnings(\"ignore\")\n",
" boston = load_boston()\n",
"data = boston[\"data\"]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "81339e3e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>CRIM</th>\n",
" <th>ZN</th>\n",
" <th>INDUS</th>\n",
" <th>CHAS</th>\n",
" <th>NOX</th>\n",
" <th>RM</th>\n",
" <th>AGE</th>\n",
" <th>DIS</th>\n",
" <th>RAD</th>\n",
" <th>TAX</th>\n",
" <th>PTRATIO</th>\n",
" <th>B</th>\n",
" <th>LSTAT</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.00632</td>\n",
" <td>18.0</td>\n",
" <td>2.31</td>\n",
" <td>0.0</td>\n",
" <td>0.538</td>\n",
" <td>6.575</td>\n",
" <td>65.2</td>\n",
" <td>4.0900</td>\n",
" <td>1.0</td>\n",
" <td>296.0</td>\n",
" <td>15.3</td>\n",
" <td>396.90</td>\n",
" <td>4.98</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.02731</td>\n",
" <td>0.0</td>\n",
" <td>7.07</td>\n",
" <td>0.0</td>\n",
" <td>0.469</td>\n",
" <td>6.421</td>\n",
" <td>78.9</td>\n",
" <td>4.9671</td>\n",
" <td>2.0</td>\n",
" <td>242.0</td>\n",
" <td>17.8</td>\n",
" <td>396.90</td>\n",
" <td>9.14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.02729</td>\n",
" <td>0.0</td>\n",
" <td>7.07</td>\n",
" <td>0.0</td>\n",
" <td>0.469</td>\n",
" <td>7.185</td>\n",
" <td>61.1</td>\n",
" <td>4.9671</td>\n",
" <td>2.0</td>\n",
" <td>242.0</td>\n",
" <td>17.8</td>\n",
" <td>392.83</td>\n",
" <td>4.03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.03237</td>\n",
" <td>0.0</td>\n",
" <td>2.18</td>\n",
" <td>0.0</td>\n",
" <td>0.458</td>\n",
" <td>6.998</td>\n",
" <td>45.8</td>\n",
" <td>6.0622</td>\n",
" <td>3.0</td>\n",
" <td>222.0</td>\n",
" <td>18.7</td>\n",
" <td>394.63</td>\n",
" <td>2.94</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.06905</td>\n",
" <td>0.0</td>\n",
" <td>2.18</td>\n",
" <td>0.0</td>\n",
" <td>0.458</td>\n",
" <td>7.147</td>\n",
" <td>54.2</td>\n",
" <td>6.0622</td>\n",
" <td>3.0</td>\n",
" <td>222.0</td>\n",
" <td>18.7</td>\n",
" <td>396.90</td>\n",
" <td>5.33</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n",
"0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n",
"1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n",
"2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n",
"3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n",
"4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n",
"\n",
" PTRATIO B LSTAT \n",
"0 15.3 396.90 4.98 \n",
"1 17.8 396.90 9.14 \n",
"2 17.8 392.83 4.03 \n",
"3 18.7 394.63 2.94 \n",
"4 18.7 396.90 5.33 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_names = boston[\"feature_names\"]\n",
"\n",
"X = pd.DataFrame(data, columns=feature_names)\n",
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "0a3b3fbd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>24.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>21.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>34.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>33.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>36.2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" price\n",
"0 24.0\n",
"1 21.6\n",
"2 34.7\n",
"3 33.4\n",
"4 36.2"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"target = boston[\"target\"]\n",
"\n",
"Y = pd.DataFrame(target, columns=[\"price\"])\n",
"Y.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "81f5f72a",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "eca2e802",
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "b7fdd109",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.linear_model import LinearRegression"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "87bbc227",
"metadata": {},
"outputs": [],
"source": [
"lr = LinearRegression()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "41af6442",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression()"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lr.fit(X_train, Y_train)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "28a67c09",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Y_test</th>\n",
" <th>Y_pred_lr</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>173</th>\n",
" <td>23.6</td>\n",
" <td>28.648960</td>\n",
" </tr>\n",
" <tr>\n",
" <th>274</th>\n",
" <td>32.4</td>\n",
" <td>36.495014</td>\n",
" </tr>\n",
" <tr>\n",
" <th>491</th>\n",
" <td>13.6</td>\n",
" <td>15.411193</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>22.8</td>\n",
" <td>25.403213</td>\n",
" </tr>\n",
" <tr>\n",
" <th>452</th>\n",
" <td>16.1</td>\n",
" <td>18.855280</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Y_test Y_pred_lr\n",
"173 23.6 28.648960\n",
"274 32.4 36.495014\n",
"491 13.6 15.411193\n",
"72 22.8 25.403213\n",
"452 16.1 18.855280"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_pred_lr = lr.predict(X_test)\n",
"check_test_lr = pd.DataFrame({\n",
" \"Y_test\": Y_test[\"price\"], \n",
" \"Y_pred_lr\": y_pred_lr.flatten()})\n",
"\n",
"check_test_lr.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "4a035a94",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"21.517444231176995\n"
]
}
],
"source": [
"from sklearn.metrics import mean_squared_error\n",
"\n",
"mean_squared_error_lr = mean_squared_error(check_test_lr[\"Y_pred_lr\"], check_test_lr[\"Y_test\"])\n",
"print(mean_squared_error_lr)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "2b87195e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.711226005748496"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.metrics import r2_score\n",
"\n",
"r2_score(Y_test, y_pred_lr)"
]
},
{
"cell_type": "markdown",
"id": "bbf6dc90",
"metadata": {},
"source": [
"## Задание 2\n",
"\n",
"Создайте модель под названием model с помощью RandomForestRegressor из модуля sklearn.ensemble.\n",
"\n",
"Сделайте агрумент n_estimators равным 1000, max_depth должен быть равен 12 и random_state сделайте равным 42.\n",
"\n",
"Обучите модель на тренировочных данных аналогично тому, как вы обучали модель LinearRegression,\n",
"но при этом в метод fit вместо датафрейма y_train поставьте y_train.values[:, 0],\n",
"чтобы получить из датафрейма одномерный массив Numpy,\n",
"так как для класса RandomForestRegressor в данном методе для аргумента y предпочтительно применение массивов вместо датафрейма.\n",
"\n",
"Сделайте предсказание на тестовых данных и посчитайте R2.\n",
"\n",
"Сравните с результатом из предыдущего задания. Напишите в комментариях к коду, какая модель в данном случае работает лучше."
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "f8f381fd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestRegressor(max_depth=12, n_estimators=1000, random_state=42)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.ensemble import RandomForestRegressor\n",
"model = RandomForestRegressor(n_estimators=1000, max_depth=12, random_state=42)\n",
"model.fit(X_train, Y_train.values[:, 0])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "c2733e21",
"metadata": {},
"outputs": [],
"source": [
"y_pred_1 = model.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "ff72edb4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.87472606157312"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r2_score(Y_test, y_pred_1)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "24d68924",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y_test</th>\n",
" <th>y_pred_lr</th>\n",
" <th>y_pred_rf</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>173</th>\n",
" <td>23.6</td>\n",
" <td>28.648960</td>\n",
" <td>22.806412</td>\n",
" </tr>\n",
" <tr>\n",
" <th>274</th>\n",
" <td>32.4</td>\n",
" <td>36.495014</td>\n",
" <td>31.131464</td>\n",
" </tr>\n",
" <tr>\n",
" <th>491</th>\n",
" <td>13.6</td>\n",
" <td>15.411193</td>\n",
" <td>16.339125</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>22.8</td>\n",
" <td>25.403213</td>\n",
" <td>23.810726</td>\n",
" </tr>\n",
" <tr>\n",
" <th>452</th>\n",
" <td>16.1</td>\n",
" <td>18.855280</td>\n",
" <td>17.139521</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76</th>\n",
" <td>20.0</td>\n",
" <td>23.146689</td>\n",
" <td>21.832284</td>\n",
" </tr>\n",
" <tr>\n",
" <th>316</th>\n",
" <td>17.8</td>\n",
" <td>17.392124</td>\n",
" <td>19.895747</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>14.0</td>\n",
" <td>14.078599</td>\n",
" <td>14.754118</td>\n",
" </tr>\n",
" <tr>\n",
" <th>471</th>\n",
" <td>19.6</td>\n",
" <td>23.036927</td>\n",
" <td>21.240835</td>\n",
" </tr>\n",
" <tr>\n",
" <th>500</th>\n",
" <td>16.8</td>\n",
" <td>20.599433</td>\n",
" <td>20.898658</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" y_test y_pred_lr y_pred_rf\n",
"173 23.6 28.648960 22.806412\n",
"274 32.4 36.495014 31.131464\n",
"491 13.6 15.411193 16.339125\n",
"72 22.8 25.403213 23.810726\n",
"452 16.1 18.855280 17.139521\n",
"76 20.0 23.146689 21.832284\n",
"316 17.8 17.392124 19.895747\n",
"140 14.0 14.078599 14.754118\n",
"471 19.6 23.036927 21.240835\n",
"500 16.8 20.599433 20.898658"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"check_test = pd.DataFrame({\n",
" \"y_test\": Y_test[\"price\"],\n",
" \"y_pred_lr\": y_pred_lr.flatten(),\n",
" \"y_pred_rf\": y_pred_1.flatten(),\n",
"})\n",
"\n",
"check_test.head(10)"
]
},
{
"cell_type": "markdown",
"id": "9f09783f",
"metadata": {},
"source": [
"R2 из первого задания меньше чем R2 во втором задании, а значить у модели построеной с помощью RandomForestRegressor предсказания ближе к тестовым."
]
},
{
"cell_type": "markdown",
"id": "a86c9368",
"metadata": {},
"source": [
"## *Задание 3\n",
"Вызовите документацию для класса RandomForestRegressor, найдите информацию об атрибуте feature_importances_.\n",
"\n",
"С помощью этого атрибута найдите сумму всех показателей важности, установите, какие два признака показывают наибольшую важность."
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "8acc1978",
"metadata": {},
"outputs": [],
"source": [
"?RandomForestRegressor"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "cd674bb4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0.03167574 0.00154252 0.00713813 0.00123624 0.01426897 0.40268179\n",
" 0.01429864 0.06397257 0.00528122 0.01152493 0.01808108 0.01245085\n",
" 0.41584732]\n"
]
}
],
"source": [
"print(model.feature_importances_)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "1e1dbef5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>feature_importance</th>\n",
" <th>name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.031676</td>\n",
" <td>CRIM</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.001543</td>\n",
" <td>ZN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.007138</td>\n",
" <td>INDUS</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.001236</td>\n",
" <td>CHAS</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.014269</td>\n",
" <td>NOX</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.402682</td>\n",
" <td>RM</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0.014299</td>\n",
" <td>AGE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.063973</td>\n",
" <td>DIS</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0.005281</td>\n",
" <td>RAD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>0.011525</td>\n",
" <td>TAX</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>0.018081</td>\n",
" <td>PTRATIO</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>0.012451</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0.415847</td>\n",
" <td>LSTAT</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" feature_importance name\n",
"0 0.031676 CRIM\n",
"1 0.001543 ZN\n",
"2 0.007138 INDUS\n",
"3 0.001236 CHAS\n",
"4 0.014269 NOX\n",
"5 0.402682 RM\n",
"6 0.014299 AGE\n",
"7 0.063973 DIS\n",
"8 0.005281 RAD\n",
"9 0.011525 TAX\n",
"10 0.018081 PTRATIO\n",
"11 0.012451 B\n",
"12 0.415847 LSTAT"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_importance = pd.DataFrame({'name':X.columns, \n",
" 'feature_importance':model.feature_importances_}, \n",
" columns=['feature_importance', 'name'])\n",
"feature_importance"
]
},
{
"cell_type": "markdown",
"id": "3bf5bd9e",
"metadata": {},
"source": [
"Два признака показываюoие наибольшую важность:"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "82439470",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>feature_importance</th>\n",
" <th>name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0.415847</td>\n",
" <td>LSTAT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.402682</td>\n",
" <td>RM</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" feature_importance name\n",
"12 0.415847 LSTAT\n",
"5 0.402682 RM"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_importance.nlargest(2, 'feature_importance')"
]
},
{
"cell_type": "markdown",
"id": "9c507bb6",
"metadata": {},
"source": [
"Сумма показателей важности:"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "2aae8d49",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.0"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.feature_importances_.sum()"
]
},
{
"cell_type": "markdown",
"id": "3e9baeab",
"metadata": {},
"source": [
"## *Задание 4\n",
"\n",
"В этом задании мы будем работать с датасетом, с которым мы уже знакомы по домашнему заданию по библиотеке Matplotlib, это датасет Credit Card Fraud Detection.Для этого датасета мы будем решать задачу классификации - будем определять,какие из транзакциции по кредитной карте являются мошенническими.Данный датасет сильно несбалансирован (так как случаи мошенничества относительно редки),так что применение метрики accuracy не принесет пользы и не поможет выбрать лучшую модель.Мы будем вычислять AUC, то есть площадь под кривой ROC.\n",
"\n",
"Импортируйте из соответствующих модулей RandomForestClassifier, GridSearchCV и train_test_split.\n",
"\n",
"Загрузите датасет creditcard.csv и создайте датафрейм df.\n",
"\n",
"С помощью метода value_counts с аргументом normalize=True убедитесь в том, что выборка несбалансирована.Используя метод info, проверьте, все ли столбцы содержат числовые данные и нет ли в них пропусков. Примените следующую настройку, чтобы можно было просматривать все столбцы датафрейма: pd.options.display.max_columns = 100.\n",
"\n",
"Просмотрите первые 10 строк датафрейма df.\n",
"\n",
"Создайте датафрейм X из датафрейма df, исключив столбец Class.\n",
"\n",
"Создайте объект Series под названием y из столбца Class.\n",
"\n",
"Разбейте X и y на тренировочный и тестовый наборы данных при помощи функции train_test_split, используя аргументы: test_size=0.3, random_state=100, stratify=y. У вас должны получиться объекты X_train, X_test, y_train и y_test.\n",
"\n",
"Просмотрите информацию о их форме. Для поиска по сетке параметров задайте такие параметры: parameters = [{'n_estimators': [10, 15],'max_features': np.arange(3, 5),'max_depth': np.arange(4, 7)}]\n",
"\n",
"Создайте модель GridSearchCV со следующими аргументами: estimator=RandomForestClassifier(random_state=100), param_grid=parameters, scoring='roc_auc', cv=3.\n",
"\n",
"Обучите модель на тренировочном наборе данных (может занять несколько минут).\n",
"\n",
"Просмотрите параметры лучшей модели с помощью атрибута best_params_.\n",
"\n",
"Предскажите вероятности классов с помощью полученнной модели и метода predict_proba.\n",
"\n",
"Из полученного результата (массив Numpy) выберите столбец с индексом 1 (вероятность класса 1) и запишите в массив y_pred_proba.\n",
"\n",
"Из модуля sklearn.metrics импортируйте метрику roc_auc_score.\n",
"\n",
"Вычислите AUC на тестовых данных и сравните с результатом,полученным на тренировочных данных, используя в качестве аргументовмассивы y_test и y_pred_proba."
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "c890d00a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Time</th>\n",
" <th>V1</th>\n",
" <th>V2</th>\n",
" <th>V3</th>\n",
" <th>V4</th>\n",
" <th>V5</th>\n",
" <th>V6</th>\n",
" <th>V7</th>\n",
" <th>V8</th>\n",
" <th>V9</th>\n",
" <th>...</th>\n",
" <th>V21</th>\n",
" <th>V22</th>\n",
" <th>V23</th>\n",
" <th>V24</th>\n",
" <th>V25</th>\n",
" <th>V26</th>\n",
" <th>V27</th>\n",
" <th>V28</th>\n",
" <th>Amount</th>\n",
" <th>Class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>-1.359807</td>\n",
" <td>-0.072781</td>\n",
" <td>2.536347</td>\n",
" <td>1.378155</td>\n",
" <td>-0.338321</td>\n",
" <td>0.462388</td>\n",
" <td>0.239599</td>\n",
" <td>0.098698</td>\n",
" <td>0.363787</td>\n",
" <td>...</td>\n",
" <td>-0.018307</td>\n",
" <td>0.277838</td>\n",
" <td>-0.110474</td>\n",
" <td>0.066928</td>\n",
" <td>0.128539</td>\n",
" <td>-0.189115</td>\n",
" <td>0.133558</td>\n",
" <td>-0.021053</td>\n",
" <td>149.62</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>1.191857</td>\n",
" <td>0.266151</td>\n",
" <td>0.166480</td>\n",
" <td>0.448154</td>\n",
" <td>0.060018</td>\n",
" <td>-0.082361</td>\n",
" <td>-0.078803</td>\n",
" <td>0.085102</td>\n",
" <td>-0.255425</td>\n",
" <td>...</td>\n",
" <td>-0.225775</td>\n",
" <td>-0.638672</td>\n",
" <td>0.101288</td>\n",
" <td>-0.339846</td>\n",
" <td>0.167170</td>\n",
" <td>0.125895</td>\n",
" <td>-0.008983</td>\n",
" <td>0.014724</td>\n",
" <td>2.69</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>-1.358354</td>\n",
" <td>-1.340163</td>\n",
" <td>1.773209</td>\n",
" <td>0.379780</td>\n",
" <td>-0.503198</td>\n",
" <td>1.800499</td>\n",
" <td>0.791461</td>\n",
" <td>0.247676</td>\n",
" <td>-1.514654</td>\n",
" <td>...</td>\n",
" <td>0.247998</td>\n",
" <td>0.771679</td>\n",
" <td>0.909412</td>\n",
" <td>-0.689281</td>\n",
" <td>-0.327642</td>\n",
" <td>-0.139097</td>\n",
" <td>-0.055353</td>\n",
" <td>-0.059752</td>\n",
" <td>378.66</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1.0</td>\n",
" <td>-0.966272</td>\n",
" <td>-0.185226</td>\n",
" <td>1.792993</td>\n",
" <td>-0.863291</td>\n",
" <td>-0.010309</td>\n",
" <td>1.247203</td>\n",
" <td>0.237609</td>\n",
" <td>0.377436</td>\n",
" <td>-1.387024</td>\n",
" <td>...</td>\n",
" <td>-0.108300</td>\n",
" <td>0.005274</td>\n",
" <td>-0.190321</td>\n",
" <td>-1.175575</td>\n",
" <td>0.647376</td>\n",
" <td>-0.221929</td>\n",
" <td>0.062723</td>\n",
" <td>0.061458</td>\n",
" <td>123.50</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>-1.158233</td>\n",
" <td>0.877737</td>\n",
" <td>1.548718</td>\n",
" <td>0.403034</td>\n",
" <td>-0.407193</td>\n",
" <td>0.095921</td>\n",
" <td>0.592941</td>\n",
" <td>-0.270533</td>\n",
" <td>0.817739</td>\n",
" <td>...</td>\n",
" <td>-0.009431</td>\n",
" <td>0.798278</td>\n",
" <td>-0.137458</td>\n",
" <td>0.141267</td>\n",
" <td>-0.206010</td>\n",
" <td>0.502292</td>\n",
" <td>0.219422</td>\n",
" <td>0.215153</td>\n",
" <td>69.99</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2.0</td>\n",
" <td>-0.425966</td>\n",
" <td>0.960523</td>\n",
" <td>1.141109</td>\n",
" <td>-0.168252</td>\n",
" <td>0.420987</td>\n",
" <td>-0.029728</td>\n",
" <td>0.476201</td>\n",
" <td>0.260314</td>\n",
" <td>-0.568671</td>\n",
" <td>...</td>\n",
" <td>-0.208254</td>\n",
" <td>-0.559825</td>\n",
" <td>-0.026398</td>\n",
" <td>-0.371427</td>\n",
" <td>-0.232794</td>\n",
" <td>0.105915</td>\n",
" <td>0.253844</td>\n",
" <td>0.081080</td>\n",
" <td>3.67</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>4.0</td>\n",
" <td>1.229658</td>\n",
" <td>0.141004</td>\n",
" <td>0.045371</td>\n",
" <td>1.202613</td>\n",
" <td>0.191881</td>\n",
" <td>0.272708</td>\n",
" <td>-0.005159</td>\n",
" <td>0.081213</td>\n",
" <td>0.464960</td>\n",
" <td>...</td>\n",
" <td>-0.167716</td>\n",
" <td>-0.270710</td>\n",
" <td>-0.154104</td>\n",
" <td>-0.780055</td>\n",
" <td>0.750137</td>\n",
" <td>-0.257237</td>\n",
" <td>0.034507</td>\n",
" <td>0.005168</td>\n",
" <td>4.99</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>7.0</td>\n",
" <td>-0.644269</td>\n",
" <td>1.417964</td>\n",
" <td>1.074380</td>\n",
" <td>-0.492199</td>\n",
" <td>0.948934</td>\n",
" <td>0.428118</td>\n",
" <td>1.120631</td>\n",
" <td>-3.807864</td>\n",
" <td>0.615375</td>\n",
" <td>...</td>\n",
" <td>1.943465</td>\n",
" <td>-1.015455</td>\n",
" <td>0.057504</td>\n",
" <td>-0.649709</td>\n",
" <td>-0.415267</td>\n",
" <td>-0.051634</td>\n",
" <td>-1.206921</td>\n",
" <td>-1.085339</td>\n",
" <td>40.80</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>7.0</td>\n",
" <td>-0.894286</td>\n",
" <td>0.286157</td>\n",
" <td>-0.113192</td>\n",
" <td>-0.271526</td>\n",
" <td>2.669599</td>\n",
" <td>3.721818</td>\n",
" <td>0.370145</td>\n",
" <td>0.851084</td>\n",
" <td>-0.392048</td>\n",
" <td>...</td>\n",
" <td>-0.073425</td>\n",
" <td>-0.268092</td>\n",
" <td>-0.204233</td>\n",
" <td>1.011592</td>\n",
" <td>0.373205</td>\n",
" <td>-0.384157</td>\n",
" <td>0.011747</td>\n",
" <td>0.142404</td>\n",
" <td>93.20</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>9.0</td>\n",
" <td>-0.338262</td>\n",
" <td>1.119593</td>\n",
" <td>1.044367</td>\n",
" <td>-0.222187</td>\n",
" <td>0.499361</td>\n",
" <td>-0.246761</td>\n",
" <td>0.651583</td>\n",
" <td>0.069539</td>\n",
" <td>-0.736727</td>\n",
" <td>...</td>\n",
" <td>-0.246914</td>\n",
" <td>-0.633753</td>\n",
" <td>-0.120794</td>\n",
" <td>-0.385050</td>\n",
" <td>-0.069733</td>\n",
" <td>0.094199</td>\n",
" <td>0.246219</td>\n",
" <td>0.083076</td>\n",
" <td>3.68</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10 rows × 31 columns</p>\n",
"</div>"
],
"text/plain": [
" Time V1 V2 V3 V4 V5 V6 V7 \\\n",
"0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n",
"1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n",
"2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n",
"3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n",
"4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n",
"5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 \n",
"6 4.0 1.229658 0.141004 0.045371 1.202613 0.191881 0.272708 -0.005159 \n",
"7 7.0 -0.644269 1.417964 1.074380 -0.492199 0.948934 0.428118 1.120631 \n",
"8 7.0 -0.894286 0.286157 -0.113192 -0.271526 2.669599 3.721818 0.370145 \n",
"9 9.0 -0.338262 1.119593 1.044367 -0.222187 0.499361 -0.246761 0.651583 \n",
"\n",
" V8 V9 ... V21 V22 V23 V24 V25 \\\n",
"0 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 \n",
"1 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 \n",
"2 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 \n",
"3 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 \n",
"4 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 \n",
"5 0.260314 -0.568671 ... -0.208254 -0.559825 -0.026398 -0.371427 -0.232794 \n",
"6 0.081213 0.464960 ... -0.167716 -0.270710 -0.154104 -0.780055 0.750137 \n",
"7 -3.807864 0.615375 ... 1.943465 -1.015455 0.057504 -0.649709 -0.415267 \n",
"8 0.851084 -0.392048 ... -0.073425 -0.268092 -0.204233 1.011592 0.373205 \n",
"9 0.069539 -0.736727 ... -0.246914 -0.633753 -0.120794 -0.385050 -0.069733 \n",
"\n",
" V26 V27 V28 Amount Class \n",
"0 -0.189115 0.133558 -0.021053 149.62 0 \n",
"1 0.125895 -0.008983 0.014724 2.69 0 \n",
"2 -0.139097 -0.055353 -0.059752 378.66 0 \n",
"3 -0.221929 0.062723 0.061458 123.50 0 \n",
"4 0.502292 0.219422 0.215153 69.99 0 \n",
"5 0.105915 0.253844 0.081080 3.67 0 \n",
"6 -0.257237 0.034507 0.005168 4.99 0 \n",
"7 -0.051634 -1.206921 -1.085339 40.80 0 \n",
"8 -0.384157 0.011747 0.142404 93.20 0 \n",
"9 0.094199 0.246219 0.083076 3.68 0 \n",
"\n",
"[10 rows x 31 columns]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"df = pd.read_csv('creditcard.csv')\n",
"df.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "0201a188",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 0.998273\n",
"1 0.001727\n",
"Name: Class, dtype: float64"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Class'].value_counts(normalize=True)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "9ffa6a96",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 284807 entries, 0 to 284806\n",
"Data columns (total 31 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Time 284807 non-null float64\n",
" 1 V1 284807 non-null float64\n",
" 2 V2 284807 non-null float64\n",
" 3 V3 284807 non-null float64\n",
" 4 V4 284807 non-null float64\n",
" 5 V5 284807 non-null float64\n",
" 6 V6 284807 non-null float64\n",
" 7 V7 284807 non-null float64\n",
" 8 V8 284807 non-null float64\n",
" 9 V9 284807 non-null float64\n",
" 10 V10 284807 non-null float64\n",
" 11 V11 284807 non-null float64\n",
" 12 V12 284807 non-null float64\n",
" 13 V13 284807 non-null float64\n",
" 14 V14 284807 non-null float64\n",
" 15 V15 284807 non-null float64\n",
" 16 V16 284807 non-null float64\n",
" 17 V17 284807 non-null float64\n",
" 18 V18 284807 non-null float64\n",
" 19 V19 284807 non-null float64\n",
" 20 V20 284807 non-null float64\n",
" 21 V21 284807 non-null float64\n",
" 22 V22 284807 non-null float64\n",
" 23 V23 284807 non-null float64\n",
" 24 V24 284807 non-null float64\n",
" 25 V25 284807 non-null float64\n",
" 26 V26 284807 non-null float64\n",
" 27 V27 284807 non-null float64\n",
" 28 V28 284807 non-null float64\n",
" 29 Amount 284807 non-null float64\n",
" 30 Class 284807 non-null int64 \n",
"dtypes: float64(30), int64(1)\n",
"memory usage: 67.4 MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "8d3439e3",
"metadata": {},
"outputs": [],
"source": [
"pd.options.display.max_columns=100"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "0f57a690",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Time</th>\n",
" <th>V1</th>\n",
" <th>V2</th>\n",
" <th>V3</th>\n",
" <th>V4</th>\n",
" <th>V5</th>\n",
" <th>V6</th>\n",
" <th>V7</th>\n",
" <th>V8</th>\n",
" <th>V9</th>\n",
" <th>V10</th>\n",
" <th>V11</th>\n",
" <th>V12</th>\n",
" <th>V13</th>\n",
" <th>V14</th>\n",
" <th>V15</th>\n",
" <th>V16</th>\n",
" <th>V17</th>\n",
" <th>V18</th>\n",
" <th>V19</th>\n",
" <th>V20</th>\n",
" <th>V21</th>\n",
" <th>V22</th>\n",
" <th>V23</th>\n",
" <th>V24</th>\n",
" <th>V25</th>\n",
" <th>V26</th>\n",
" <th>V27</th>\n",
" <th>V28</th>\n",
" <th>Amount</th>\n",
" <th>Class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>-1.359807</td>\n",
" <td>-0.072781</td>\n",
" <td>2.536347</td>\n",
" <td>1.378155</td>\n",
" <td>-0.338321</td>\n",
" <td>0.462388</td>\n",
" <td>0.239599</td>\n",
" <td>0.098698</td>\n",
" <td>0.363787</td>\n",
" <td>0.090794</td>\n",
" <td>-0.551600</td>\n",
" <td>-0.617801</td>\n",
" <td>-0.991390</td>\n",
" <td>-0.311169</td>\n",
" <td>1.468177</td>\n",
" <td>-0.470401</td>\n",
" <td>0.207971</td>\n",
" <td>0.025791</td>\n",
" <td>0.403993</td>\n",
" <td>0.251412</td>\n",
" <td>-0.018307</td>\n",
" <td>0.277838</td>\n",
" <td>-0.110474</td>\n",
" <td>0.066928</td>\n",
" <td>0.128539</td>\n",
" <td>-0.189115</td>\n",
" <td>0.133558</td>\n",
" <td>-0.021053</td>\n",
" <td>149.62</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>1.191857</td>\n",
" <td>0.266151</td>\n",
" <td>0.166480</td>\n",
" <td>0.448154</td>\n",
" <td>0.060018</td>\n",
" <td>-0.082361</td>\n",
" <td>-0.078803</td>\n",
" <td>0.085102</td>\n",
" <td>-0.255425</td>\n",
" <td>-0.166974</td>\n",
" <td>1.612727</td>\n",
" <td>1.065235</td>\n",
" <td>0.489095</td>\n",
" <td>-0.143772</td>\n",
" <td>0.635558</td>\n",
" <td>0.463917</td>\n",
" <td>-0.114805</td>\n",
" <td>-0.183361</td>\n",
" <td>-0.145783</td>\n",
" <td>-0.069083</td>\n",
" <td>-0.225775</td>\n",
" <td>-0.638672</td>\n",
" <td>0.101288</td>\n",
" <td>-0.339846</td>\n",
" <td>0.167170</td>\n",
" <td>0.125895</td>\n",
" <td>-0.008983</td>\n",
" <td>0.014724</td>\n",
" <td>2.69</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>-1.358354</td>\n",
" <td>-1.340163</td>\n",
" <td>1.773209</td>\n",
" <td>0.379780</td>\n",
" <td>-0.503198</td>\n",
" <td>1.800499</td>\n",
" <td>0.791461</td>\n",
" <td>0.247676</td>\n",
" <td>-1.514654</td>\n",
" <td>0.207643</td>\n",
" <td>0.624501</td>\n",
" <td>0.066084</td>\n",
" <td>0.717293</td>\n",
" <td>-0.165946</td>\n",
" <td>2.345865</td>\n",
" <td>-2.890083</td>\n",
" <td>1.109969</td>\n",
" <td>-0.121359</td>\n",
" <td>-2.261857</td>\n",
" <td>0.524980</td>\n",
" <td>0.247998</td>\n",
" <td>0.771679</td>\n",
" <td>0.909412</td>\n",
" <td>-0.689281</td>\n",
" <td>-0.327642</td>\n",
" <td>-0.139097</td>\n",
" <td>-0.055353</td>\n",
" <td>-0.059752</td>\n",
" <td>378.66</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1.0</td>\n",
" <td>-0.966272</td>\n",
" <td>-0.185226</td>\n",
" <td>1.792993</td>\n",
" <td>-0.863291</td>\n",
" <td>-0.010309</td>\n",
" <td>1.247203</td>\n",
" <td>0.237609</td>\n",
" <td>0.377436</td>\n",
" <td>-1.387024</td>\n",
" <td>-0.054952</td>\n",
" <td>-0.226487</td>\n",
" <td>0.178228</td>\n",
" <td>0.507757</td>\n",
" <td>-0.287924</td>\n",
" <td>-0.631418</td>\n",
" <td>-1.059647</td>\n",
" <td>-0.684093</td>\n",
" <td>1.965775</td>\n",
" <td>-1.232622</td>\n",
" <td>-0.208038</td>\n",
" <td>-0.108300</td>\n",
" <td>0.005274</td>\n",
" <td>-0.190321</td>\n",
" <td>-1.175575</td>\n",
" <td>0.647376</td>\n",
" <td>-0.221929</td>\n",
" <td>0.062723</td>\n",
" <td>0.061458</td>\n",
" <td>123.50</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>-1.158233</td>\n",
" <td>0.877737</td>\n",
" <td>1.548718</td>\n",
" <td>0.403034</td>\n",
" <td>-0.407193</td>\n",
" <td>0.095921</td>\n",
" <td>0.592941</td>\n",
" <td>-0.270533</td>\n",
" <td>0.817739</td>\n",
" <td>0.753074</td>\n",
" <td>-0.822843</td>\n",
" <td>0.538196</td>\n",
" <td>1.345852</td>\n",
" <td>-1.119670</td>\n",
" <td>0.175121</td>\n",
" <td>-0.451449</td>\n",
" <td>-0.237033</td>\n",
" <td>-0.038195</td>\n",
" <td>0.803487</td>\n",
" <td>0.408542</td>\n",
" <td>-0.009431</td>\n",
" <td>0.798278</td>\n",
" <td>-0.137458</td>\n",
" <td>0.141267</td>\n",
" <td>-0.206010</td>\n",
" <td>0.502292</td>\n",
" <td>0.219422</td>\n",
" <td>0.215153</td>\n",
" <td>69.99</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2.0</td>\n",
" <td>-0.425966</td>\n",
" <td>0.960523</td>\n",
" <td>1.141109</td>\n",
" <td>-0.168252</td>\n",
" <td>0.420987</td>\n",
" <td>-0.029728</td>\n",
" <td>0.476201</td>\n",
" <td>0.260314</td>\n",
" <td>-0.568671</td>\n",
" <td>-0.371407</td>\n",
" <td>1.341262</td>\n",
" <td>0.359894</td>\n",
" <td>-0.358091</td>\n",
" <td>-0.137134</td>\n",
" <td>0.517617</td>\n",
" <td>0.401726</td>\n",
" <td>-0.058133</td>\n",
" <td>0.068653</td>\n",
" <td>-0.033194</td>\n",
" <td>0.084968</td>\n",
" <td>-0.208254</td>\n",
" <td>-0.559825</td>\n",
" <td>-0.026398</td>\n",
" <td>-0.371427</td>\n",
" <td>-0.232794</td>\n",
" <td>0.105915</td>\n",
" <td>0.253844</td>\n",
" <td>0.081080</td>\n",
" <td>3.67</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>4.0</td>\n",
" <td>1.229658</td>\n",
" <td>0.141004</td>\n",
" <td>0.045371</td>\n",
" <td>1.202613</td>\n",
" <td>0.191881</td>\n",
" <td>0.272708</td>\n",
" <td>-0.005159</td>\n",
" <td>0.081213</td>\n",
" <td>0.464960</td>\n",
" <td>-0.099254</td>\n",
" <td>-1.416907</td>\n",
" <td>-0.153826</td>\n",
" <td>-0.751063</td>\n",
" <td>0.167372</td>\n",
" <td>0.050144</td>\n",
" <td>-0.443587</td>\n",
" <td>0.002821</td>\n",
" <td>-0.611987</td>\n",
" <td>-0.045575</td>\n",
" <td>-0.219633</td>\n",
" <td>-0.167716</td>\n",
" <td>-0.270710</td>\n",
" <td>-0.154104</td>\n",
" <td>-0.780055</td>\n",
" <td>0.750137</td>\n",
" <td>-0.257237</td>\n",
" <td>0.034507</td>\n",
" <td>0.005168</td>\n",
" <td>4.99</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>7.0</td>\n",
" <td>-0.644269</td>\n",
" <td>1.417964</td>\n",
" <td>1.074380</td>\n",
" <td>-0.492199</td>\n",
" <td>0.948934</td>\n",
" <td>0.428118</td>\n",
" <td>1.120631</td>\n",
" <td>-3.807864</td>\n",
" <td>0.615375</td>\n",
" <td>1.249376</td>\n",
" <td>-0.619468</td>\n",
" <td>0.291474</td>\n",
" <td>1.757964</td>\n",
" <td>-1.323865</td>\n",
" <td>0.686133</td>\n",
" <td>-0.076127</td>\n",
" <td>-1.222127</td>\n",
" <td>-0.358222</td>\n",
" <td>0.324505</td>\n",
" <td>-0.156742</td>\n",
" <td>1.943465</td>\n",
" <td>-1.015455</td>\n",
" <td>0.057504</td>\n",
" <td>-0.649709</td>\n",
" <td>-0.415267</td>\n",
" <td>-0.051634</td>\n",
" <td>-1.206921</td>\n",
" <td>-1.085339</td>\n",
" <td>40.80</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>7.0</td>\n",
" <td>-0.894286</td>\n",
" <td>0.286157</td>\n",
" <td>-0.113192</td>\n",
" <td>-0.271526</td>\n",
" <td>2.669599</td>\n",
" <td>3.721818</td>\n",
" <td>0.370145</td>\n",
" <td>0.851084</td>\n",
" <td>-0.392048</td>\n",
" <td>-0.410430</td>\n",
" <td>-0.705117</td>\n",
" <td>-0.110452</td>\n",
" <td>-0.286254</td>\n",
" <td>0.074355</td>\n",
" <td>-0.328783</td>\n",
" <td>-0.210077</td>\n",
" <td>-0.499768</td>\n",
" <td>0.118765</td>\n",
" <td>0.570328</td>\n",
" <td>0.052736</td>\n",
" <td>-0.073425</td>\n",
" <td>-0.268092</td>\n",
" <td>-0.204233</td>\n",
" <td>1.011592</td>\n",
" <td>0.373205</td>\n",
" <td>-0.384157</td>\n",
" <td>0.011747</td>\n",
" <td>0.142404</td>\n",
" <td>93.20</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>9.0</td>\n",
" <td>-0.338262</td>\n",
" <td>1.119593</td>\n",
" <td>1.044367</td>\n",
" <td>-0.222187</td>\n",
" <td>0.499361</td>\n",
" <td>-0.246761</td>\n",
" <td>0.651583</td>\n",
" <td>0.069539</td>\n",
" <td>-0.736727</td>\n",
" <td>-0.366846</td>\n",
" <td>1.017614</td>\n",
" <td>0.836390</td>\n",
" <td>1.006844</td>\n",
" <td>-0.443523</td>\n",
" <td>0.150219</td>\n",
" <td>0.739453</td>\n",
" <td>-0.540980</td>\n",
" <td>0.476677</td>\n",
" <td>0.451773</td>\n",
" <td>0.203711</td>\n",
" <td>-0.246914</td>\n",
" <td>-0.633753</td>\n",
" <td>-0.120794</td>\n",
" <td>-0.385050</td>\n",
" <td>-0.069733</td>\n",
" <td>0.094199</td>\n",
" <td>0.246219</td>\n",
" <td>0.083076</td>\n",
" <td>3.68</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Time V1 V2 V3 V4 V5 V6 V7 \\\n",
"0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n",
"1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n",
"2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n",
"3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n",
"4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n",
"5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 \n",
"6 4.0 1.229658 0.141004 0.045371 1.202613 0.191881 0.272708 -0.005159 \n",
"7 7.0 -0.644269 1.417964 1.074380 -0.492199 0.948934 0.428118 1.120631 \n",
"8 7.0 -0.894286 0.286157 -0.113192 -0.271526 2.669599 3.721818 0.370145 \n",
"9 9.0 -0.338262 1.119593 1.044367 -0.222187 0.499361 -0.246761 0.651583 \n",
"\n",
" V8 V9 V10 V11 V12 V13 V14 \\\n",
"0 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 \n",
"1 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 \n",
"2 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 \n",
"3 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 \n",
"4 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 \n",
"5 0.260314 -0.568671 -0.371407 1.341262 0.359894 -0.358091 -0.137134 \n",
"6 0.081213 0.464960 -0.099254 -1.416907 -0.153826 -0.751063 0.167372 \n",
"7 -3.807864 0.615375 1.249376 -0.619468 0.291474 1.757964 -1.323865 \n",
"8 0.851084 -0.392048 -0.410430 -0.705117 -0.110452 -0.286254 0.074355 \n",
"9 0.069539 -0.736727 -0.366846 1.017614 0.836390 1.006844 -0.443523 \n",
"\n",
" V15 V16 V17 V18 V19 V20 V21 \\\n",
"0 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 \n",
"1 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 \n",
"2 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 \n",
"3 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 -0.208038 -0.108300 \n",
"4 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 \n",
"5 0.517617 0.401726 -0.058133 0.068653 -0.033194 0.084968 -0.208254 \n",
"6 0.050144 -0.443587 0.002821 -0.611987 -0.045575 -0.219633 -0.167716 \n",
"7 0.686133 -0.076127 -1.222127 -0.358222 0.324505 -0.156742 1.943465 \n",
"8 -0.328783 -0.210077 -0.499768 0.118765 0.570328 0.052736 -0.073425 \n",
"9 0.150219 0.739453 -0.540980 0.476677 0.451773 0.203711 -0.246914 \n",
"\n",
" V22 V23 V24 V25 V26 V27 V28 \\\n",
"0 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 \n",
"1 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 \n",
"2 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 \n",
"3 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 \n",
"4 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 \n",
"5 -0.559825 -0.026398 -0.371427 -0.232794 0.105915 0.253844 0.081080 \n",
"6 -0.270710 -0.154104 -0.780055 0.750137 -0.257237 0.034507 0.005168 \n",
"7 -1.015455 0.057504 -0.649709 -0.415267 -0.051634 -1.206921 -1.085339 \n",
"8 -0.268092 -0.204233 1.011592 0.373205 -0.384157 0.011747 0.142404 \n",
"9 -0.633753 -0.120794 -0.385050 -0.069733 0.094199 0.246219 0.083076 \n",
"\n",
" Amount Class \n",
"0 149.62 0 \n",
"1 2.69 0 \n",
"2 378.66 0 \n",
"3 123.50 0 \n",
"4 69.99 0 \n",
"5 3.67 0 \n",
"6 4.99 0 \n",
"7 40.80 0 \n",
"8 93.20 0 \n",
"9 3.68 0 "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "3530430c",
"metadata": {},
"outputs": [],
"source": [
"# Separate the 'Class' label column from the remaining feature columns.\n",
"y = df[\"Class\"]\n",
"X = df.drop(columns=\"Class\")"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "c66ea2e2",
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "1e628dd0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"X_train (199364, 30)\n",
"X_test (85443, 30)\n",
"y_train (199364,)\n",
"y_test (85443,)\n"
]
}
],
"source": [
"# Report the dimensions of every part produced by the train/test split.\n",
"for label, part in (\n",
"    ('X_train ', X_train),\n",
"    ('X_test ', X_test),\n",
"    ('y_train ', y_train),\n",
"    ('y_test ', y_test),\n",
"):\n",
"    print(label, part.shape)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "8908147f",
"metadata": {},
"outputs": [],
"source": [
"# Hyperparameter grid for the random forest: one dict, every combination is tried.\n",
"parameters = [\n",
"    {\n",
"        'n_estimators': [10, 15],\n",
"        'max_features': np.arange(3, 5),\n",
"        'max_depth': np.arange(4, 7),\n",
"    }\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "8f593bef",
"metadata": {},
"outputs": [],
"source": [
"# Grid search over `parameters` with 3-fold cross-validation, scored by ROC AUC.\n",
"clf = GridSearchCV(\n",
"    RandomForestClassifier(random_state=100),\n",
"    parameters,\n",
"    cv=3,\n",
"    scoring='roc_auc',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "99d17337",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=100),\n",
" param_grid=[{'max_depth': array([4, 5, 6]),\n",
" 'max_features': array([3, 4]),\n",
" 'n_estimators': [10, 15]}],\n",
" scoring='roc_auc')"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "79f8c7e5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'max_depth': 6, 'max_features': 3, 'n_estimators': 15}"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf.best_params_"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "74efab0a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestClassifier(max_depth=6, max_features=3, n_estimators=15)"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Refit a forest with the hyperparameters reported by the grid search\n",
"# (max_depth=6, max_features=3, n_estimators=15).\n",
"# random_state=100 is the same seed the tuned estimator used; without it the\n",
"# forest is rebuilt from a different random seed on every run, so the model and\n",
"# the ROC AUC computed from it would not be reproducible.\n",
"clf = RandomForestClassifier(\n",
"    max_depth=6,\n",
"    max_features=3,\n",
"    n_estimators=15,\n",
"    random_state=100,\n",
")\n",
"\n",
"clf.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "e4d1fe4a",
"metadata": {},
"outputs": [],
"source": [
"y_pred = clf.predict_proba(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "49357f79",
"metadata": {},
"outputs": [],
"source": [
"y_pred_proba = y_pred[:, 1]"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "fc40ec74",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import roc_auc_score"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "84b0112b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9476239854368701"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"roc_auc_score(y_test, y_pred_proba)"
]
},
{
"cell_type": "markdown",
"id": "475ac08f",
"metadata": {},
"source": [
"# *Дополнительные задания:"
]
},
{
"cell_type": "markdown",
"id": "3728fe05",
"metadata": {},
"source": [
"Загрузите датасет Wine из встроенных датасетов sklearn.datasets с помощью функции load_wine в переменную data."
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "e0600074",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import load_wine\n",
"data = load_wine()"
]
},
{
"cell_type": "markdown",
"id": "6e22dfd0",
"metadata": {},
"source": [
"Полученный датасет не является датафреймом. Это структура данных, имеющая ключи аналогично словарю. Просмотрите тип данных этой структуры данных и создайте список data_keys, содержащий ее ключи."
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "1cc31a29",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'sklearn.utils.Bunch'> \n",
"\n",
"dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])\n"
]
}
],
"source": [
"print(type(data), '\\n')\n",
"# The task asks for a list of keys; keys() alone returns a dict_keys view,\n",
"# so materialise it with list().\n",
"data_keys = list(data.keys())\n",
"print(data_keys)"
]
},
{
"cell_type": "markdown",
"id": "05b0491c",
"metadata": {},
"source": [
"Просмотрите данные, описание и названия признаков в датасете. Описание нужно вывести в виде привычного, аккуратно оформленного текста, без обозначений переноса строки, но с самими переносами и т.д."
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "67997daa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,\n",
" 1.065e+03],\n",
" [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,\n",
" 1.050e+03],\n",
" [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,\n",
" 1.185e+03],\n",
" ...,\n",
" [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,\n",
" 8.350e+02],\n",
" [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,\n",
" 8.400e+02],\n",
" [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,\n",
" 5.600e+02]])"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.data"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "42bf0f1f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
".. _wine_dataset:\n",
"\n",
"Wine recognition dataset\n",
"------------------------\n",
"\n",
"**Data Set Characteristics:**\n",
"\n",
" :Number of Instances: 178 (50 in each of three classes)\n",
" :Number of Attributes: 13 numeric, predictive attributes and the class\n",
" :Attribute Information:\n",
" \t\t- Alcohol\n",
" \t\t- Malic acid\n",
" \t\t- Ash\n",
"\t\t- Alcalinity of ash \n",
" \t\t- Magnesium\n",
"\t\t- Total phenols\n",
" \t\t- Flavanoids\n",
" \t\t- Nonflavanoid phenols\n",
" \t\t- Proanthocyanins\n",
"\t\t- Color intensity\n",
" \t\t- Hue\n",
" \t\t- OD280/OD315 of diluted wines\n",
" \t\t- Proline\n",
"\n",
" - class:\n",
" - class_0\n",
" - class_1\n",
" - class_2\n",
"\t\t\n",
" :Summary Statistics:\n",
" \n",
" ============================= ==== ===== ======= =====\n",
" Min Max Mean SD\n",
" ============================= ==== ===== ======= =====\n",
" Alcohol: 11.0 14.8 13.0 0.8\n",
" Malic Acid: 0.74 5.80 2.34 1.12\n",
" Ash: 1.36 3.23 2.36 0.27\n",
" Alcalinity of Ash: 10.6 30.0 19.5 3.3\n",
" Magnesium: 70.0 162.0 99.7 14.3\n",
" Total Phenols: 0.98 3.88 2.29 0.63\n",
" Flavanoids: 0.34 5.08 2.03 1.00\n",
" Nonflavanoid Phenols: 0.13 0.66 0.36 0.12\n",
" Proanthocyanins: 0.41 3.58 1.59 0.57\n",
" Colour Intensity: 1.3 13.0 5.1 2.3\n",
" Hue: 0.48 1.71 0.96 0.23\n",
" OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.71\n",
" Proline: 278 1680 746 315\n",
" ============================= ==== ===== ======= =====\n",
"\n",
" :Missing Attribute Values: None\n",
" :Class Distribution: class_0 (59), class_1 (71), class_2 (48)\n",
" :Creator: R.A. Fisher\n",
" :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n",
" :Date: July, 1988\n",
"\n",
"This is a copy of UCI ML Wine recognition datasets.\n",
"https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data\n",
"\n",
"The data is the results of a chemical analysis of wines grown in the same\n",
"region in Italy by three different cultivators. There are thirteen different\n",
"measurements taken for different constituents found in the three types of\n",
"wine.\n",
"\n",
"Original Owners: \n",
"\n",
"Forina, M. et al, PARVUS - \n",
"An Extendible Package for Data Exploration, Classification and Correlation. \n",
"Institute of Pharmaceutical and Food Analysis and Technologies,\n",
"Via Brigata Salerno, 16147 Genoa, Italy.\n",
"\n",
"Citation:\n",
"\n",
"Lichman, M. (2013). UCI Machine Learning Repository\n",
"[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,\n",
"School of Information and Computer Science. \n",
"\n",
".. topic:: References\n",
"\n",
" (1) S. Aeberhard, D. Coomans and O. de Vel, \n",
" Comparison of Classifiers in High Dimensional Settings, \n",
" Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of \n",
" Mathematics and Statistics, James Cook University of North Queensland. \n",
" (Also submitted to Technometrics). \n",
"\n",
" The data was used with many others for comparing various \n",
" classifiers. The classes are separable, though only RDA \n",
" has achieved 100% correct classification. \n",
" (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) \n",
" (All results using the leave-one-out technique) \n",
"\n",
" (2) S. Aeberhard, D. Coomans and O. de Vel, \n",
" \"THE CLASSIFICATION PERFORMANCE OF RDA\" \n",
" Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of \n",
" Mathematics and Statistics, James Cook University of North Queensland. \n",
" (Also submitted to Journal of Chemometrics).\n",
"\n"
]
}
],
"source": [
"print(data.DESCR)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "3990394e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['alcohol',\n",
" 'malic_acid',\n",
" 'ash',\n",
" 'alcalinity_of_ash',\n",
" 'magnesium',\n",
" 'total_phenols',\n",
" 'flavanoids',\n",
" 'nonflavanoid_phenols',\n",
" 'proanthocyanins',\n",
" 'color_intensity',\n",
" 'hue',\n",
" 'od280/od315_of_diluted_wines',\n",
" 'proline']"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.feature_names"
]
},
{
"cell_type": "markdown",
"id": "ca3e3b90",
"metadata": {},
"source": [
"Сколько классов содержит целевая переменная датасета? Выведите названия классов."
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "3dcc2473",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Количество классов: (3,) \n",
"\n",
"Названия классов: ['class_0' 'class_1' 'class_2']\n"
]
}
],
"source": [
"# Number of distinct target labels; len() yields a plain integer, whereas the\n",
"# .shape of the unique-values array is a one-element tuple.\n",
"print('Количество классов: ', len(np.unique(data[\"target\"])), '\\n')\n",
"print('Названия классов: ',data[\"target_names\"])"
]
},
{
"cell_type": "markdown",
"id": "4eb0d981",
"metadata": {},
"source": [
"На основе данных датасета (они содержатся в двумерном массиве Numpy) и названий признаков создайте датафрейм под названием X."
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "52257354",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>alcohol</th>\n",
" <th>malic_acid</th>\n",
" <th>ash</th>\n",
" <th>alcalinity_of_ash</th>\n",
" <th>magnesium</th>\n",
" <th>total_phenols</th>\n",
" <th>flavanoids</th>\n",
" <th>nonflavanoid_phenols</th>\n",
" <th>proanthocyanins</th>\n",
" <th>color_intensity</th>\n",
" <th>hue</th>\n",
" <th>od280/od315_of_diluted_wines</th>\n",
" <th>proline</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>14.23</td>\n",
" <td>1.71</td>\n",
" <td>2.43</td>\n",
" <td>15.6</td>\n",
" <td>127.0</td>\n",
" <td>2.80</td>\n",
" <td>3.06</td>\n",
" <td>0.28</td>\n",
" <td>2.29</td>\n",
" <td>5.64</td>\n",
" <td>1.04</td>\n",
" <td>3.92</td>\n",
" <td>1065.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>13.20</td>\n",
" <td>1.78</td>\n",
" <td>2.14</td>\n",
" <td>11.2</td>\n",
" <td>100.0</td>\n",
" <td>2.65</td>\n",
" <td>2.76</td>\n",
" <td>0.26</td>\n",
" <td>1.28</td>\n",
" <td>4.38</td>\n",
" <td>1.05</td>\n",
" <td>3.40</td>\n",
" <td>1050.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>13.16</td>\n",
" <td>2.36</td>\n",
" <td>2.67</td>\n",
" <td>18.6</td>\n",
" <td>101.0</td>\n",
" <td>2.80</td>\n",
" <td>3.24</td>\n",
" <td>0.30</td>\n",
" <td>2.81</td>\n",
" <td>5.68</td>\n",
" <td>1.03</td>\n",
" <td>3.17</td>\n",
" <td>1185.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>14.37</td>\n",
" <td>1.95</td>\n",
" <td>2.50</td>\n",
" <td>16.8</td>\n",
" <td>113.0</td>\n",
" <td>3.85</td>\n",
" <td>3.49</td>\n",
" <td>0.24</td>\n",
" <td>2.18</td>\n",
" <td>7.80</td>\n",
" <td>0.86</td>\n",
" <td>3.45</td>\n",
" <td>1480.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>13.24</td>\n",
" <td>2.59</td>\n",
" <td>2.87</td>\n",
" <td>21.0</td>\n",
" <td>118.0</td>\n",
" <td>2.80</td>\n",
" <td>2.69</td>\n",
" <td>0.39</td>\n",
" <td>1.82</td>\n",
" <td>4.32</td>\n",
" <td>1.04</td>\n",
" <td>2.93</td>\n",
" <td>735.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
"0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
"1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
"2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
"3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
"4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
"\n",
" flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
"0 3.06 0.28 2.29 5.64 1.04 \n",
"1 2.76 0.26 1.28 4.38 1.05 \n",
"2 3.24 0.30 2.81 5.68 1.03 \n",
"3 3.49 0.24 2.18 7.80 0.86 \n",
"4 2.69 0.39 1.82 4.32 1.04 \n",
"\n",
" od280/od315_of_diluted_wines proline \n",
"0 3.92 1065.0 \n",
"1 3.40 1050.0 \n",
"2 3.17 1185.0 \n",
"3 3.45 1480.0 \n",
"4 2.93 735.0 "
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Wrap the feature array in a DataFrame labelled with the dataset's feature names.\n",
"X = pd.DataFrame(data=data[\"data\"], columns=data[\"feature_names\"])\n",
"X.head()"
]
},
{
"cell_type": "markdown",
"id": "917c33ed",
"metadata": {},
"source": [
"Выясните размер датафрейма X и установите, имеются ли в нем пропущенные значения."
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "f66d1569",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(178, 13)"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.shape"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "4a1379f8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 178 entries, 0 to 177\n",
"Data columns (total 13 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 alcohol 178 non-null float64\n",
" 1 malic_acid 178 non-null float64\n",
" 2 ash 178 non-null float64\n",
" 3 alcalinity_of_ash 178 non-null float64\n",
" 4 magnesium 178 non-null float64\n",
" 5 total_phenols 178 non-null float64\n",
" 6 flavanoids 178 non-null float64\n",
" 7 nonflavanoid_phenols 178 non-null float64\n",
" 8 proanthocyanins 178 non-null float64\n",
" 9 color_intensity 178 non-null float64\n",
" 10 hue 178 non-null float64\n",
" 11 od280/od315_of_diluted_wines 178 non-null float64\n",
" 12 proline 178 non-null float64\n",
"dtypes: float64(13)\n",
"memory usage: 18.2 KB\n"
]
}
],
"source": [
"X.info()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "f5573521",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"alcohol 0\n",
"malic_acid 0\n",
"ash 0\n",
"alcalinity_of_ash 0\n",
"magnesium 0\n",
"total_phenols 0\n",
"flavanoids 0\n",
"nonflavanoid_phenols 0\n",
"proanthocyanins 0\n",
"color_intensity 0\n",
"hue 0\n",
"od280/od315_of_diluted_wines 0\n",
"proline 0\n",
"dtype: int64"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Missing values per column: summing the boolean mask counts its True entries.\n",
"X.isna().sum()"
]
},
{
"cell_type": "markdown",
"id": "7fcfb081",
"metadata": {},
"source": [
"Добавьте в датафрейм поле с классами вин в виде чисел, имеющих тип данных numpy.int64. Название поля - 'target'."
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "89d0aa13",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 178 entries, 0 to 177\n",
"Data columns (total 14 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 alcohol 178 non-null float64\n",
" 1 malic_acid 178 non-null float64\n",
" 2 ash 178 non-null float64\n",
" 3 alcalinity_of_ash 178 non-null float64\n",
" 4 magnesium 178 non-null float64\n",
" 5 total_phenols 178 non-null float64\n",
" 6 flavanoids 178 non-null float64\n",
" 7 nonflavanoid_phenols 178 non-null float64\n",
" 8 proanthocyanins 178 non-null float64\n",
" 9 color_intensity 178 non-null float64\n",
" 10 hue 178 non-null float64\n",
" 11 od280/od315_of_diluted_wines 178 non-null float64\n",
" 12 proline 178 non-null float64\n",
" 13 target 178 non-null int64 \n",
"dtypes: float64(13), int64(1)\n",
"memory usage: 19.6 KB\n"
]
}
],
"source": [
"# Append the class labels as a 64-bit integer 'target' column, then check the dtypes.\n",
"X = X.assign(target=data.target.astype(np.int64))\n",
"X.info()"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "50bcdef6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>alcohol</th>\n",
" <th>malic_acid</th>\n",
" <th>ash</th>\n",
" <th>alcalinity_of_ash</th>\n",
" <th>magnesium</th>\n",
" <th>total_phenols</th>\n",
" <th>flavanoids</th>\n",
" <th>nonflavanoid_phenols</th>\n",
" <th>proanthocyanins</th>\n",
" <th>color_intensity</th>\n",
" <th>hue</th>\n",
" <th>od280/od315_of_diluted_wines</th>\n",
" <th>proline</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>14.23</td>\n",
" <td>1.71</td>\n",
" <td>2.43</td>\n",
" <td>15.6</td>\n",
" <td>127.0</td>\n",
" <td>2.80</td>\n",
" <td>3.06</td>\n",
" <td>0.28</td>\n",
" <td>2.29</td>\n",
" <td>5.64</td>\n",
" <td>1.04</td>\n",
" <td>3.92</td>\n",
" <td>1065.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>13.20</td>\n",
" <td>1.78</td>\n",
" <td>2.14</td>\n",
" <td>11.2</td>\n",
" <td>100.0</td>\n",
" <td>2.65</td>\n",
" <td>2.76</td>\n",
" <td>0.26</td>\n",
" <td>1.28</td>\n",
" <td>4.38</td>\n",
" <td>1.05</td>\n",
" <td>3.40</td>\n",
" <td>1050.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>13.16</td>\n",
" <td>2.36</td>\n",
" <td>2.67</td>\n",
" <td>18.6</td>\n",
" <td>101.0</td>\n",
" <td>2.80</td>\n",
" <td>3.24</td>\n",
" <td>0.30</td>\n",
" <td>2.81</td>\n",
" <td>5.68</td>\n",
" <td>1.03</td>\n",
" <td>3.17</td>\n",
" <td>1185.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>14.37</td>\n",
" <td>1.95</td>\n",
" <td>2.50</td>\n",
" <td>16.8</td>\n",
" <td>113.0</td>\n",
" <td>3.85</td>\n",
" <td>3.49</td>\n",
" <td>0.24</td>\n",
" <td>2.18</td>\n",
" <td>7.80</td>\n",
" <td>0.86</td>\n",
" <td>3.45</td>\n",
" <td>1480.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>13.24</td>\n",
" <td>2.59</td>\n",
" <td>2.87</td>\n",
" <td>21.0</td>\n",
" <td>118.0</td>\n",
" <td>2.80</td>\n",
" <td>2.69</td>\n",
" <td>0.39</td>\n",
" <td>1.82</td>\n",
" <td>4.32</td>\n",
" <td>1.04</td>\n",
" <td>2.93</td>\n",
" <td>735.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
"0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
"1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
"2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
"3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
"4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
"\n",
" flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
"0 3.06 0.28 2.29 5.64 1.04 \n",
"1 2.76 0.26 1.28 4.38 1.05 \n",
"2 3.24 0.30 2.81 5.68 1.03 \n",
"3 3.49 0.24 2.18 7.80 0.86 \n",
"4 2.69 0.39 1.82 4.32 1.04 \n",
"\n",
" od280/od315_of_diluted_wines proline target \n",
"0 3.92 1065.0 0 \n",
"1 3.40 1050.0 0 \n",
"2 3.17 1185.0 0 \n",
"3 3.45 1480.0 0 \n",
"4 2.93 735.0 0 "
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.head()"
]
},
{
"cell_type": "markdown",
"id": "704ea79c",
"metadata": {},
"source": [
"Постройте матрицу корреляций для всех полей X. Дайте полученному датафрейму название X_corr."
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "41d5c34c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>alcohol</th>\n",
" <th>malic_acid</th>\n",
" <th>ash</th>\n",
" <th>alcalinity_of_ash</th>\n",
" <th>magnesium</th>\n",
" <th>total_phenols</th>\n",
" <th>flavanoids</th>\n",
" <th>nonflavanoid_phenols</th>\n",
" <th>proanthocyanins</th>\n",
" <th>color_intensity</th>\n",
" <th>hue</th>\n",
" <th>od280/od315_of_diluted_wines</th>\n",
" <th>proline</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>alcohol</th>\n",
" <td>1.000000</td>\n",
" <td>0.094397</td>\n",
" <td>0.211545</td>\n",
" <td>-0.310235</td>\n",
" <td>0.270798</td>\n",
" <td>0.289101</td>\n",
" <td>0.236815</td>\n",
" <td>-0.155929</td>\n",
" <td>0.136698</td>\n",
" <td>0.546364</td>\n",
" <td>-0.071747</td>\n",
" <td>0.072343</td>\n",
" <td>0.643720</td>\n",
" <td>-0.328222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>malic_acid</th>\n",
" <td>0.094397</td>\n",
" <td>1.000000</td>\n",
" <td>0.164045</td>\n",
" <td>0.288500</td>\n",
" <td>-0.054575</td>\n",
" <td>-0.335167</td>\n",
" <td>-0.411007</td>\n",
" <td>0.292977</td>\n",
" <td>-0.220746</td>\n",
" <td>0.248985</td>\n",
" <td>-0.561296</td>\n",
" <td>-0.368710</td>\n",
" <td>-0.192011</td>\n",
" <td>0.437776</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ash</th>\n",
" <td>0.211545</td>\n",
" <td>0.164045</td>\n",
" <td>1.000000</td>\n",
" <td>0.443367</td>\n",
" <td>0.286587</td>\n",
" <td>0.128980</td>\n",
" <td>0.115077</td>\n",
" <td>0.186230</td>\n",
" <td>0.009652</td>\n",
" <td>0.258887</td>\n",
" <td>-0.074667</td>\n",
" <td>0.003911</td>\n",
" <td>0.223626</td>\n",
" <td>-0.049643</td>\n",
" </tr>\n",
" <tr>\n",
" <th>alcalinity_of_ash</th>\n",
" <td>-0.310235</td>\n",
" <td>0.288500</td>\n",
" <td>0.443367</td>\n",
" <td>1.000000</td>\n",
" <td>-0.083333</td>\n",
" <td>-0.321113</td>\n",
" <td>-0.351370</td>\n",
" <td>0.361922</td>\n",
" <td>-0.197327</td>\n",
" <td>0.018732</td>\n",
" <td>-0.273955</td>\n",
" <td>-0.276769</td>\n",
" <td>-0.440597</td>\n",
" <td>0.517859</td>\n",
" </tr>\n",
" <tr>\n",
" <th>magnesium</th>\n",
" <td>0.270798</td>\n",
" <td>-0.054575</td>\n",
" <td>0.286587</td>\n",
" <td>-0.083333</td>\n",
" <td>1.000000</td>\n",
" <td>0.214401</td>\n",
" <td>0.195784</td>\n",
" <td>-0.256294</td>\n",
" <td>0.236441</td>\n",
" <td>0.199950</td>\n",
" <td>0.055398</td>\n",
" <td>0.066004</td>\n",
" <td>0.393351</td>\n",
" <td>-0.209179</td>\n",
" </tr>\n",
" <tr>\n",
" <th>total_phenols</th>\n",
" <td>0.289101</td>\n",
" <td>-0.335167</td>\n",
" <td>0.128980</td>\n",
" <td>-0.321113</td>\n",
" <td>0.214401</td>\n",
" <td>1.000000</td>\n",
" <td>0.864564</td>\n",
" <td>-0.449935</td>\n",
" <td>0.612413</td>\n",
" <td>-0.055136</td>\n",
" <td>0.433681</td>\n",
" <td>0.699949</td>\n",
" <td>0.498115</td>\n",
" <td>-0.719163</td>\n",
" </tr>\n",
" <tr>\n",
" <th>flavanoids</th>\n",
" <td>0.236815</td>\n",
" <td>-0.411007</td>\n",
" <td>0.115077</td>\n",
" <td>-0.351370</td>\n",
" <td>0.195784</td>\n",
" <td>0.864564</td>\n",
" <td>1.000000</td>\n",
" <td>-0.537900</td>\n",
" <td>0.652692</td>\n",
" <td>-0.172379</td>\n",
" <td>0.543479</td>\n",
" <td>0.787194</td>\n",
" <td>0.494193</td>\n",
" <td>-0.847498</td>\n",
" </tr>\n",
" <tr>\n",
" <th>nonflavanoid_phenols</th>\n",
" <td>-0.155929</td>\n",
" <td>0.292977</td>\n",
" <td>0.186230</td>\n",
" <td>0.361922</td>\n",
" <td>-0.256294</td>\n",
" <td>-0.449935</td>\n",
" <td>-0.537900</td>\n",
" <td>1.000000</td>\n",
" <td>-0.365845</td>\n",
" <td>0.139057</td>\n",
" <td>-0.262640</td>\n",
" <td>-0.503270</td>\n",
" <td>-0.311385</td>\n",
" <td>0.489109</td>\n",
" </tr>\n",
" <tr>\n",
" <th>proanthocyanins</th>\n",
" <td>0.136698</td>\n",
" <td>-0.220746</td>\n",
" <td>0.009652</td>\n",
" <td>-0.197327</td>\n",
" <td>0.236441</td>\n",
" <td>0.612413</td>\n",
" <td>0.652692</td>\n",
" <td>-0.365845</td>\n",
" <td>1.000000</td>\n",
" <td>-0.025250</td>\n",
" <td>0.295544</td>\n",
" <td>0.519067</td>\n",
" <td>0.330417</td>\n",
" <td>-0.499130</td>\n",
" </tr>\n",
" <tr>\n",
" <th>color_intensity</th>\n",
" <td>0.546364</td>\n",
" <td>0.248985</td>\n",
" <td>0.258887</td>\n",
" <td>0.018732</td>\n",
" <td>0.199950</td>\n",
" <td>-0.055136</td>\n",
" <td>-0.172379</td>\n",
" <td>0.139057</td>\n",
" <td>-0.025250</td>\n",
" <td>1.000000</td>\n",
" <td>-0.521813</td>\n",
" <td>-0.428815</td>\n",
" <td>0.316100</td>\n",
" <td>0.265668</td>\n",
" </tr>\n",
" <tr>\n",
" <th>hue</th>\n",
" <td>-0.071747</td>\n",
" <td>-0.561296</td>\n",
" <td>-0.074667</td>\n",
" <td>-0.273955</td>\n",
" <td>0.055398</td>\n",
" <td>0.433681</td>\n",
" <td>0.543479</td>\n",
" <td>-0.262640</td>\n",
" <td>0.295544</td>\n",
" <td>-0.521813</td>\n",
" <td>1.000000</td>\n",
" <td>0.565468</td>\n",
" <td>0.236183</td>\n",
" <td>-0.617369</td>\n",
" </tr>\n",
" <tr>\n",
" <th>od280/od315_of_diluted_wines</th>\n",
" <td>0.072343</td>\n",
" <td>-0.368710</td>\n",
" <td>0.003911</td>\n",
" <td>-0.276769</td>\n",
" <td>0.066004</td>\n",
" <td>0.699949</td>\n",
" <td>0.787194</td>\n",
" <td>-0.503270</td>\n",
" <td>0.519067</td>\n",
" <td>-0.428815</td>\n",
" <td>0.565468</td>\n",
" <td>1.000000</td>\n",
" <td>0.312761</td>\n",
" <td>-0.788230</td>\n",
" </tr>\n",
" <tr>\n",
" <th>proline</th>\n",
" <td>0.643720</td>\n",
" <td>-0.192011</td>\n",
" <td>0.223626</td>\n",
" <td>-0.440597</td>\n",
" <td>0.393351</td>\n",
" <td>0.498115</td>\n",
" <td>0.494193</td>\n",
" <td>-0.311385</td>\n",
" <td>0.330417</td>\n",
" <td>0.316100</td>\n",
" <td>0.236183</td>\n",
" <td>0.312761</td>\n",
" <td>1.000000</td>\n",
" <td>-0.633717</td>\n",
" </tr>\n",
" <tr>\n",
" <th>target</th>\n",
" <td>-0.328222</td>\n",
" <td>0.437776</td>\n",
" <td>-0.049643</td>\n",
" <td>0.517859</td>\n",
" <td>-0.209179</td>\n",
" <td>-0.719163</td>\n",
" <td>-0.847498</td>\n",
" <td>0.489109</td>\n",
" <td>-0.499130</td>\n",
" <td>0.265668</td>\n",
" <td>-0.617369</td>\n",
" <td>-0.788230</td>\n",
" <td>-0.633717</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" alcohol malic_acid ash \\\n",
"alcohol 1.000000 0.094397 0.211545 \n",
"malic_acid 0.094397 1.000000 0.164045 \n",
"ash 0.211545 0.164045 1.000000 \n",
"alcalinity_of_ash -0.310235 0.288500 0.443367 \n",
"magnesium 0.270798 -0.054575 0.286587 \n",
"total_phenols 0.289101 -0.335167 0.128980 \n",
"flavanoids 0.236815 -0.411007 0.115077 \n",
"nonflavanoid_phenols -0.155929 0.292977 0.186230 \n",
"proanthocyanins 0.136698 -0.220746 0.009652 \n",
"color_intensity 0.546364 0.248985 0.258887 \n",
"hue -0.071747 -0.561296 -0.074667 \n",
"od280/od315_of_diluted_wines 0.072343 -0.368710 0.003911 \n",
"proline 0.643720 -0.192011 0.223626 \n",
"target -0.328222 0.437776 -0.049643 \n",
"\n",
" alcalinity_of_ash magnesium total_phenols \\\n",
"alcohol -0.310235 0.270798 0.289101 \n",
"malic_acid 0.288500 -0.054575 -0.335167 \n",
"ash 0.443367 0.286587 0.128980 \n",
"alcalinity_of_ash 1.000000 -0.083333 -0.321113 \n",
"magnesium -0.083333 1.000000 0.214401 \n",
"total_phenols -0.321113 0.214401 1.000000 \n",
"flavanoids -0.351370 0.195784 0.864564 \n",
"nonflavanoid_phenols 0.361922 -0.256294 -0.449935 \n",
"proanthocyanins -0.197327 0.236441 0.612413 \n",
"color_intensity 0.018732 0.199950 -0.055136 \n",
"hue -0.273955 0.055398 0.433681 \n",
"od280/od315_of_diluted_wines -0.276769 0.066004 0.699949 \n",
"proline -0.440597 0.393351 0.498115 \n",
"target 0.517859 -0.209179 -0.719163 \n",
"\n",
" flavanoids nonflavanoid_phenols \\\n",
"alcohol 0.236815 -0.155929 \n",
"malic_acid -0.411007 0.292977 \n",
"ash 0.115077 0.186230 \n",
"alcalinity_of_ash -0.351370 0.361922 \n",
"magnesium 0.195784 -0.256294 \n",
"total_phenols 0.864564 -0.449935 \n",
"flavanoids 1.000000 -0.537900 \n",
"nonflavanoid_phenols -0.537900 1.000000 \n",
"proanthocyanins 0.652692 -0.365845 \n",
"color_intensity -0.172379 0.139057 \n",
"hue 0.543479 -0.262640 \n",
"od280/od315_of_diluted_wines 0.787194 -0.503270 \n",
"proline 0.494193 -0.311385 \n",
"target -0.847498 0.489109 \n",
"\n",
" proanthocyanins color_intensity hue \\\n",
"alcohol 0.136698 0.546364 -0.071747 \n",
"malic_acid -0.220746 0.248985 -0.561296 \n",
"ash 0.009652 0.258887 -0.074667 \n",
"alcalinity_of_ash -0.197327 0.018732 -0.273955 \n",
"magnesium 0.236441 0.199950 0.055398 \n",
"total_phenols 0.612413 -0.055136 0.433681 \n",
"flavanoids 0.652692 -0.172379 0.543479 \n",
"nonflavanoid_phenols -0.365845 0.139057 -0.262640 \n",
"proanthocyanins 1.000000 -0.025250 0.295544 \n",
"color_intensity -0.025250 1.000000 -0.521813 \n",
"hue 0.295544 -0.521813 1.000000 \n",
"od280/od315_of_diluted_wines 0.519067 -0.428815 0.565468 \n",
"proline 0.330417 0.316100 0.236183 \n",
"target -0.499130 0.265668 -0.617369 \n",
"\n",
" od280/od315_of_diluted_wines proline target \n",
"alcohol 0.072343 0.643720 -0.328222 \n",
"malic_acid -0.368710 -0.192011 0.437776 \n",
"ash 0.003911 0.223626 -0.049643 \n",
"alcalinity_of_ash -0.276769 -0.440597 0.517859 \n",
"magnesium 0.066004 0.393351 -0.209179 \n",
"total_phenols 0.699949 0.498115 -0.719163 \n",
"flavanoids 0.787194 0.494193 -0.847498 \n",
"nonflavanoid_phenols -0.503270 -0.311385 0.489109 \n",
"proanthocyanins 0.519067 0.330417 -0.499130 \n",
"color_intensity -0.428815 0.316100 0.265668 \n",
"hue 0.565468 0.236183 -0.617369 \n",
"od280/od315_of_diluted_wines 1.000000 0.312761 -0.788230 \n",
"proline 0.312761 1.000000 -0.633717 \n",
"target -0.788230 -0.633717 1.000000 "
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_corr = X.corr()\n",
"X_corr"
]
},
  {
   "cell_type": "markdown",
   "id": "1ed7c122",
   "metadata": {},
   "source": [
    "Создайте список high_corr из признаков, корреляция которых с полем target по абсолютному значению превышает 0.5 (причём само поле target не должно входить в этот список)."
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "6edf6763",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['alcalinity_of_ash', 'total_phenols', 'flavanoids', 'hue',\n",
" 'od280/od315_of_diluted_wines', 'proline'],\n",
" dtype='object')"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"high_corr = X_corr.loc[(X_corr.index != 'target') & (abs(X_corr['target']) > .5), X_corr.columns != 'target'].index\n",
"high_corr"
]
},
  {
   "cell_type": "markdown",
   "id": "f0ff52e6",
   "metadata": {},
   "source": [
    "Удалите из датафрейма X поле с целевой переменной. Для всех признаков, названия которых содержатся в списке high_corr, вычислите квадрат их значений и добавьте в датафрейм X соответствующие поля с суффиксом '_2', добавленным к первоначальному названию признака. Итоговый датафрейм должен содержать все поля, которые были в нем изначально, а также поля с признаками из списка high_corr, возведенными в квадрат. Выведите описание полей датафрейма X с помощью метода describe."
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "1e1403ec",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>alcohol</th>\n",
" <th>malic_acid</th>\n",
" <th>ash</th>\n",
" <th>alcalinity_of_ash</th>\n",
" <th>magnesium</th>\n",
" <th>total_phenols</th>\n",
" <th>flavanoids</th>\n",
" <th>nonflavanoid_phenols</th>\n",
" <th>proanthocyanins</th>\n",
" <th>color_intensity</th>\n",
" <th>hue</th>\n",
" <th>od280/od315_of_diluted_wines</th>\n",
" <th>proline</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>14.23</td>\n",
" <td>1.71</td>\n",
" <td>2.43</td>\n",
" <td>15.6</td>\n",
" <td>127.0</td>\n",
" <td>2.80</td>\n",
" <td>3.06</td>\n",
" <td>0.28</td>\n",
" <td>2.29</td>\n",
" <td>5.64</td>\n",
" <td>1.04</td>\n",
" <td>3.92</td>\n",
" <td>1065.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>13.20</td>\n",
" <td>1.78</td>\n",
" <td>2.14</td>\n",
" <td>11.2</td>\n",
" <td>100.0</td>\n",
" <td>2.65</td>\n",
" <td>2.76</td>\n",
" <td>0.26</td>\n",
" <td>1.28</td>\n",
" <td>4.38</td>\n",
" <td>1.05</td>\n",
" <td>3.40</td>\n",
" <td>1050.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>13.16</td>\n",
" <td>2.36</td>\n",
" <td>2.67</td>\n",
" <td>18.6</td>\n",
" <td>101.0</td>\n",
" <td>2.80</td>\n",
" <td>3.24</td>\n",
" <td>0.30</td>\n",
" <td>2.81</td>\n",
" <td>5.68</td>\n",
" <td>1.03</td>\n",
" <td>3.17</td>\n",
" <td>1185.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>14.37</td>\n",
" <td>1.95</td>\n",
" <td>2.50</td>\n",
" <td>16.8</td>\n",
" <td>113.0</td>\n",
" <td>3.85</td>\n",
" <td>3.49</td>\n",
" <td>0.24</td>\n",
" <td>2.18</td>\n",
" <td>7.80</td>\n",
" <td>0.86</td>\n",
" <td>3.45</td>\n",
" <td>1480.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>13.24</td>\n",
" <td>2.59</td>\n",
" <td>2.87</td>\n",
" <td>21.0</td>\n",
" <td>118.0</td>\n",
" <td>2.80</td>\n",
" <td>2.69</td>\n",
" <td>0.39</td>\n",
" <td>1.82</td>\n",
" <td>4.32</td>\n",
" <td>1.04</td>\n",
" <td>2.93</td>\n",
" <td>735.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
"0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
"1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
"2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
"3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
"4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
"\n",
" flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
"0 3.06 0.28 2.29 5.64 1.04 \n",
"1 2.76 0.26 1.28 4.38 1.05 \n",
"2 3.24 0.30 2.81 5.68 1.03 \n",
"3 3.49 0.24 2.18 7.80 0.86 \n",
"4 2.69 0.39 1.82 4.32 1.04 \n",
"\n",
" od280/od315_of_diluted_wines proline \n",
"0 3.92 1065.0 \n",
"1 3.40 1050.0 \n",
"2 3.17 1185.0 \n",
"3 3.45 1480.0 \n",
"4 2.93 735.0 "
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X = X.drop('target', axis=1)\n",
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "74173e8d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>alcohol</th>\n",
" <th>malic_acid</th>\n",
" <th>ash</th>\n",
" <th>alcalinity_of_ash</th>\n",
" <th>magnesium</th>\n",
" <th>total_phenols</th>\n",
" <th>flavanoids</th>\n",
" <th>nonflavanoid_phenols</th>\n",
" <th>proanthocyanins</th>\n",
" <th>color_intensity</th>\n",
" <th>hue</th>\n",
" <th>od280/od315_of_diluted_wines</th>\n",
" <th>proline</th>\n",
" <th>alcalinity_of_ash_2</th>\n",
" <th>total_phenols_2</th>\n",
" <th>flavanoids_2</th>\n",
" <th>hue_2</th>\n",
" <th>od280/od315_of_diluted_wines_2</th>\n",
" <th>proline_2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>14.23</td>\n",
" <td>1.71</td>\n",
" <td>2.43</td>\n",
" <td>15.6</td>\n",
" <td>127.0</td>\n",
" <td>2.80</td>\n",
" <td>3.06</td>\n",
" <td>0.28</td>\n",
" <td>2.29</td>\n",
" <td>5.64</td>\n",
" <td>1.04</td>\n",
" <td>3.92</td>\n",
" <td>1065.0</td>\n",
" <td>243.36</td>\n",
" <td>7.8400</td>\n",
" <td>9.3636</td>\n",
" <td>1.0816</td>\n",
" <td>15.3664</td>\n",
" <td>1134225.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>13.20</td>\n",
" <td>1.78</td>\n",
" <td>2.14</td>\n",
" <td>11.2</td>\n",
" <td>100.0</td>\n",
" <td>2.65</td>\n",
" <td>2.76</td>\n",
" <td>0.26</td>\n",
" <td>1.28</td>\n",
" <td>4.38</td>\n",
" <td>1.05</td>\n",
" <td>3.40</td>\n",
" <td>1050.0</td>\n",
" <td>125.44</td>\n",
" <td>7.0225</td>\n",
" <td>7.6176</td>\n",
" <td>1.1025</td>\n",
" <td>11.5600</td>\n",
" <td>1102500.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>13.16</td>\n",
" <td>2.36</td>\n",
" <td>2.67</td>\n",
" <td>18.6</td>\n",
" <td>101.0</td>\n",
" <td>2.80</td>\n",
" <td>3.24</td>\n",
" <td>0.30</td>\n",
" <td>2.81</td>\n",
" <td>5.68</td>\n",
" <td>1.03</td>\n",
" <td>3.17</td>\n",
" <td>1185.0</td>\n",
" <td>345.96</td>\n",
" <td>7.8400</td>\n",
" <td>10.4976</td>\n",
" <td>1.0609</td>\n",
" <td>10.0489</td>\n",
" <td>1404225.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>14.37</td>\n",
" <td>1.95</td>\n",
" <td>2.50</td>\n",
" <td>16.8</td>\n",
" <td>113.0</td>\n",
" <td>3.85</td>\n",
" <td>3.49</td>\n",
" <td>0.24</td>\n",
" <td>2.18</td>\n",
" <td>7.80</td>\n",
" <td>0.86</td>\n",
" <td>3.45</td>\n",
" <td>1480.0</td>\n",
" <td>282.24</td>\n",
" <td>14.8225</td>\n",
" <td>12.1801</td>\n",
" <td>0.7396</td>\n",
" <td>11.9025</td>\n",
" <td>2190400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>13.24</td>\n",
" <td>2.59</td>\n",
" <td>2.87</td>\n",
" <td>21.0</td>\n",
" <td>118.0</td>\n",
" <td>2.80</td>\n",
" <td>2.69</td>\n",
" <td>0.39</td>\n",
" <td>1.82</td>\n",
" <td>4.32</td>\n",
" <td>1.04</td>\n",
" <td>2.93</td>\n",
" <td>735.0</td>\n",
" <td>441.00</td>\n",
" <td>7.8400</td>\n",
" <td>7.2361</td>\n",
" <td>1.0816</td>\n",
" <td>8.5849</td>\n",
" <td>540225.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
"0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
"1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
"2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
"3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
"4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
"\n",
" flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
"0 3.06 0.28 2.29 5.64 1.04 \n",
"1 2.76 0.26 1.28 4.38 1.05 \n",
"2 3.24 0.30 2.81 5.68 1.03 \n",
"3 3.49 0.24 2.18 7.80 0.86 \n",
"4 2.69 0.39 1.82 4.32 1.04 \n",
"\n",
" od280/od315_of_diluted_wines proline alcalinity_of_ash_2 \\\n",
"0 3.92 1065.0 243.36 \n",
"1 3.40 1050.0 125.44 \n",
"2 3.17 1185.0 345.96 \n",
"3 3.45 1480.0 282.24 \n",
"4 2.93 735.0 441.00 \n",
"\n",
" total_phenols_2 flavanoids_2 hue_2 od280/od315_of_diluted_wines_2 \\\n",
"0 7.8400 9.3636 1.0816 15.3664 \n",
"1 7.0225 7.6176 1.1025 11.5600 \n",
"2 7.8400 10.4976 1.0609 10.0489 \n",
"3 14.8225 12.1801 0.7396 11.9025 \n",
"4 7.8400 7.2361 1.0816 8.5849 \n",
"\n",
" proline_2 \n",
"0 1134225.0 \n",
"1 1102500.0 \n",
"2 1404225.0 \n",
"3 2190400.0 \n",
"4 540225.0 "
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"for feature_name in high_corr:\n",
" X['{0}_2'.format(feature_name)] = X[feature_name] ** 2\n",
"\n",
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "190f74c0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>alcohol</th>\n",
" <th>malic_acid</th>\n",
" <th>ash</th>\n",
" <th>alcalinity_of_ash</th>\n",
" <th>magnesium</th>\n",
" <th>total_phenols</th>\n",
" <th>flavanoids</th>\n",
" <th>nonflavanoid_phenols</th>\n",
" <th>proanthocyanins</th>\n",
" <th>color_intensity</th>\n",
" <th>hue</th>\n",
" <th>od280/od315_of_diluted_wines</th>\n",
" <th>proline</th>\n",
" <th>alcalinity_of_ash_2</th>\n",
" <th>total_phenols_2</th>\n",
" <th>flavanoids_2</th>\n",
" <th>hue_2</th>\n",
" <th>od280/od315_of_diluted_wines_2</th>\n",
" <th>proline_2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>178.000000</td>\n",
" <td>178.000000</td>\n",
" <td>178.000000</td>\n",
" <td>178.000000</td>\n",
" <td>178.000000</td>\n",
" <td>178.000000</td>\n",
" <td>178.000000</td>\n",
" <td>178.000000</td>\n",
" <td>178.000000</td>\n",
" <td>178.000000</td>\n",
" <td>178.000000</td>\n",
" <td>178.000000</td>\n",
" <td>178.000000</td>\n",
" <td>178.000000</td>\n",
" <td>178.000000</td>\n",
" <td>178.000000</td>\n",
" <td>178.000000</td>\n",
" <td>178.000000</td>\n",
" <td>1.780000e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>13.000618</td>\n",
" <td>2.336348</td>\n",
" <td>2.366517</td>\n",
" <td>19.494944</td>\n",
" <td>99.741573</td>\n",
" <td>2.295112</td>\n",
" <td>2.029270</td>\n",
" <td>0.361854</td>\n",
" <td>1.590899</td>\n",
" <td>5.058090</td>\n",
" <td>0.957449</td>\n",
" <td>2.611685</td>\n",
" <td>746.893258</td>\n",
" <td>391.142865</td>\n",
" <td>5.657030</td>\n",
" <td>5.110049</td>\n",
" <td>0.968661</td>\n",
" <td>7.322155</td>\n",
" <td>6.564591e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.811827</td>\n",
" <td>1.117146</td>\n",
" <td>0.274344</td>\n",
" <td>3.339564</td>\n",
" <td>14.282484</td>\n",
" <td>0.625851</td>\n",
" <td>0.998859</td>\n",
" <td>0.124453</td>\n",
" <td>0.572359</td>\n",
" <td>2.318286</td>\n",
" <td>0.228572</td>\n",
" <td>0.709990</td>\n",
" <td>314.907474</td>\n",
" <td>133.671775</td>\n",
" <td>2.936294</td>\n",
" <td>4.211441</td>\n",
" <td>0.443798</td>\n",
" <td>3.584316</td>\n",
" <td>5.558591e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>11.030000</td>\n",
" <td>0.740000</td>\n",
" <td>1.360000</td>\n",
" <td>10.600000</td>\n",
" <td>70.000000</td>\n",
" <td>0.980000</td>\n",
" <td>0.340000</td>\n",
" <td>0.130000</td>\n",
" <td>0.410000</td>\n",
" <td>1.280000</td>\n",
" <td>0.480000</td>\n",
" <td>1.270000</td>\n",
" <td>278.000000</td>\n",
" <td>112.360000</td>\n",
" <td>0.960400</td>\n",
" <td>0.115600</td>\n",
" <td>0.230400</td>\n",
" <td>1.612900</td>\n",
" <td>7.728400e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>12.362500</td>\n",
" <td>1.602500</td>\n",
" <td>2.210000</td>\n",
" <td>17.200000</td>\n",
" <td>88.000000</td>\n",
" <td>1.742500</td>\n",
" <td>1.205000</td>\n",
" <td>0.270000</td>\n",
" <td>1.250000</td>\n",
" <td>3.220000</td>\n",
" <td>0.782500</td>\n",
" <td>1.937500</td>\n",
" <td>500.500000</td>\n",
" <td>295.840000</td>\n",
" <td>3.036325</td>\n",
" <td>1.452100</td>\n",
" <td>0.612325</td>\n",
" <td>3.754075</td>\n",
" <td>2.505010e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>13.050000</td>\n",
" <td>1.865000</td>\n",
" <td>2.360000</td>\n",
" <td>19.500000</td>\n",
" <td>98.000000</td>\n",
" <td>2.355000</td>\n",
" <td>2.135000</td>\n",
" <td>0.340000</td>\n",
" <td>1.555000</td>\n",
" <td>4.690000</td>\n",
" <td>0.965000</td>\n",
" <td>2.780000</td>\n",
" <td>673.500000</td>\n",
" <td>380.250000</td>\n",
" <td>5.546050</td>\n",
" <td>4.558250</td>\n",
" <td>0.931250</td>\n",
" <td>7.728400</td>\n",
" <td>4.536045e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>13.677500</td>\n",
" <td>3.082500</td>\n",
" <td>2.557500</td>\n",
" <td>21.500000</td>\n",
" <td>107.000000</td>\n",
" <td>2.800000</td>\n",
" <td>2.875000</td>\n",
" <td>0.437500</td>\n",
" <td>1.950000</td>\n",
" <td>6.200000</td>\n",
" <td>1.120000</td>\n",
" <td>3.170000</td>\n",
" <td>985.000000</td>\n",
" <td>462.250000</td>\n",
" <td>7.840000</td>\n",
" <td>8.265700</td>\n",
" <td>1.254400</td>\n",
" <td>10.048900</td>\n",
" <td>9.702250e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>14.830000</td>\n",
" <td>5.800000</td>\n",
" <td>3.230000</td>\n",
" <td>30.000000</td>\n",
" <td>162.000000</td>\n",
" <td>3.880000</td>\n",
" <td>5.080000</td>\n",
" <td>0.660000</td>\n",
" <td>3.580000</td>\n",
" <td>13.000000</td>\n",
" <td>1.710000</td>\n",
" <td>4.000000</td>\n",
" <td>1680.000000</td>\n",
" <td>900.000000</td>\n",
" <td>15.054400</td>\n",
" <td>25.806400</td>\n",
" <td>2.924100</td>\n",
" <td>16.000000</td>\n",
" <td>2.822400e+06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" alcohol malic_acid ash alcalinity_of_ash magnesium \\\n",
"count 178.000000 178.000000 178.000000 178.000000 178.000000 \n",
"mean 13.000618 2.336348 2.366517 19.494944 99.741573 \n",
"std 0.811827 1.117146 0.274344 3.339564 14.282484 \n",
"min 11.030000 0.740000 1.360000 10.600000 70.000000 \n",
"25% 12.362500 1.602500 2.210000 17.200000 88.000000 \n",
"50% 13.050000 1.865000 2.360000 19.500000 98.000000 \n",
"75% 13.677500 3.082500 2.557500 21.500000 107.000000 \n",
"max 14.830000 5.800000 3.230000 30.000000 162.000000 \n",
"\n",
" total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n",
"count 178.000000 178.000000 178.000000 178.000000 \n",
"mean 2.295112 2.029270 0.361854 1.590899 \n",
"std 0.625851 0.998859 0.124453 0.572359 \n",
"min 0.980000 0.340000 0.130000 0.410000 \n",
"25% 1.742500 1.205000 0.270000 1.250000 \n",
"50% 2.355000 2.135000 0.340000 1.555000 \n",
"75% 2.800000 2.875000 0.437500 1.950000 \n",
"max 3.880000 5.080000 0.660000 3.580000 \n",
"\n",
" color_intensity hue od280/od315_of_diluted_wines proline \\\n",
"count 178.000000 178.000000 178.000000 178.000000 \n",
"mean 5.058090 0.957449 2.611685 746.893258 \n",
"std 2.318286 0.228572 0.709990 314.907474 \n",
"min 1.280000 0.480000 1.270000 278.000000 \n",
"25% 3.220000 0.782500 1.937500 500.500000 \n",
"50% 4.690000 0.965000 2.780000 673.500000 \n",
"75% 6.200000 1.120000 3.170000 985.000000 \n",
"max 13.000000 1.710000 4.000000 1680.000000 \n",
"\n",
" alcalinity_of_ash_2 total_phenols_2 flavanoids_2 hue_2 \\\n",
"count 178.000000 178.000000 178.000000 178.000000 \n",
"mean 391.142865 5.657030 5.110049 0.968661 \n",
"std 133.671775 2.936294 4.211441 0.443798 \n",
"min 112.360000 0.960400 0.115600 0.230400 \n",
"25% 295.840000 3.036325 1.452100 0.612325 \n",
"50% 380.250000 5.546050 4.558250 0.931250 \n",
"75% 462.250000 7.840000 8.265700 1.254400 \n",
"max 900.000000 15.054400 25.806400 2.924100 \n",
"\n",
" od280/od315_of_diluted_wines_2 proline_2 \n",
"count 178.000000 1.780000e+02 \n",
"mean 7.322155 6.564591e+05 \n",
"std 3.584316 5.558591e+05 \n",
"min 1.612900 7.728400e+04 \n",
"25% 3.754075 2.505010e+05 \n",
"50% 7.728400 4.536045e+05 \n",
"75% 10.048900 9.702250e+05 \n",
"max 16.000000 2.822400e+06 "
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
   ],
   "source": [
    "# Summary statistics (count, mean, std, min, quartiles, max) for every column of X,\n",
    "# including the squared '_2' features.\n",
    "X.describe()"
   ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "97f44af7",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}