mirror of
https://github.com/denis-on/ph_ny_mat_sci.git
synced 2025-12-06 10:29:27 +03:00
3983 lines
135 KiB
Plaintext
3983 lines
135 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "e4f5991e",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Тема “Обучение с учителем”"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "2442aab9",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Задание 1\n",
|
||
"Импортируйте библиотеки pandas и numpy.\n",
|
||
"\n",
|
||
"Загрузите \"Boston House Prices dataset\" из встроенных наборов данных библиотеки sklearn. Создайте датафреймы X и y из этих данных.\n",
|
||
"\n",
|
||
"Разбейте эти датафреймы на тренировочные (X_train, y_train) и тестовые (X_test, y_test) с помощью функции train_test_split так, чтобы размер тестовой выборки составлял 30% от всех данных, при этом аргумент random_state должен быть равен 42.\n",
|
||
"\n",
|
||
"Создайте модель линейной регрессии под названием lr с помощью класса LinearRegression из модуля sklearn.linear_model.\n",
|
||
"\n",
|
||
"Обучите модель на тренировочных данных (используйте все признаки) и сделайте предсказание на тестовых.\n",
|
||
"\n",
|
||
"Вычислите R2 полученных предсказаний с помощью r2_score из модуля sklearn.metrics."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "f79ac751",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "483c687f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.datasets import load_boston"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "fd8d693f",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import warnings\n",
|
||
"from sklearn.datasets import load_boston\n",
|
||
"with warnings.catch_warnings():\n",
|
||
" # You should probably not use this dataset.\n",
|
||
" warnings.filterwarnings(\"ignore\")\n",
|
||
" boston = load_boston()\n",
|
||
"data = boston[\"data\"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "81339e3e",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>CRIM</th>\n",
|
||
" <th>ZN</th>\n",
|
||
" <th>INDUS</th>\n",
|
||
" <th>CHAS</th>\n",
|
||
" <th>NOX</th>\n",
|
||
" <th>RM</th>\n",
|
||
" <th>AGE</th>\n",
|
||
" <th>DIS</th>\n",
|
||
" <th>RAD</th>\n",
|
||
" <th>TAX</th>\n",
|
||
" <th>PTRATIO</th>\n",
|
||
" <th>B</th>\n",
|
||
" <th>LSTAT</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0.00632</td>\n",
|
||
" <td>18.0</td>\n",
|
||
" <td>2.31</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.538</td>\n",
|
||
" <td>6.575</td>\n",
|
||
" <td>65.2</td>\n",
|
||
" <td>4.0900</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>296.0</td>\n",
|
||
" <td>15.3</td>\n",
|
||
" <td>396.90</td>\n",
|
||
" <td>4.98</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0.02731</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>7.07</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.469</td>\n",
|
||
" <td>6.421</td>\n",
|
||
" <td>78.9</td>\n",
|
||
" <td>4.9671</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>242.0</td>\n",
|
||
" <td>17.8</td>\n",
|
||
" <td>396.90</td>\n",
|
||
" <td>9.14</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>0.02729</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>7.07</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.469</td>\n",
|
||
" <td>7.185</td>\n",
|
||
" <td>61.1</td>\n",
|
||
" <td>4.9671</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>242.0</td>\n",
|
||
" <td>17.8</td>\n",
|
||
" <td>392.83</td>\n",
|
||
" <td>4.03</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>0.03237</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>2.18</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.458</td>\n",
|
||
" <td>6.998</td>\n",
|
||
" <td>45.8</td>\n",
|
||
" <td>6.0622</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>222.0</td>\n",
|
||
" <td>18.7</td>\n",
|
||
" <td>394.63</td>\n",
|
||
" <td>2.94</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>0.06905</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>2.18</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.458</td>\n",
|
||
" <td>7.147</td>\n",
|
||
" <td>54.2</td>\n",
|
||
" <td>6.0622</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>222.0</td>\n",
|
||
" <td>18.7</td>\n",
|
||
" <td>396.90</td>\n",
|
||
" <td>5.33</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n",
|
||
"0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n",
|
||
"1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n",
|
||
"2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n",
|
||
"3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n",
|
||
"4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n",
|
||
"\n",
|
||
" PTRATIO B LSTAT \n",
|
||
"0 15.3 396.90 4.98 \n",
|
||
"1 17.8 396.90 9.14 \n",
|
||
"2 17.8 392.83 4.03 \n",
|
||
"3 18.7 394.63 2.94 \n",
|
||
"4 18.7 396.90 5.33 "
|
||
]
|
||
},
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"feature_names = boston[\"feature_names\"]\n",
|
||
"\n",
|
||
"X = pd.DataFrame(data, columns=feature_names)\n",
|
||
"X.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "0a3b3fbd",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>price</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>24.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>21.6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>34.7</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>33.4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>36.2</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" price\n",
|
||
"0 24.0\n",
|
||
"1 21.6\n",
|
||
"2 34.7\n",
|
||
"3 33.4\n",
|
||
"4 36.2"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"target = boston[\"target\"]\n",
|
||
"\n",
|
||
"Y = pd.DataFrame(target, columns=[\"price\"])\n",
|
||
"Y.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "81f5f72a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.model_selection import train_test_split"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "eca2e802",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=42)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "b7fdd109",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.linear_model import LinearRegression"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "87bbc227",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"lr = LinearRegression()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "41af6442",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"LinearRegression()"
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"lr.fit(X_train, Y_train)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "28a67c09",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Y_test</th>\n",
|
||
" <th>Y_pred_lr</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>173</th>\n",
|
||
" <td>23.6</td>\n",
|
||
" <td>28.648960</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>274</th>\n",
|
||
" <td>32.4</td>\n",
|
||
" <td>36.495014</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>491</th>\n",
|
||
" <td>13.6</td>\n",
|
||
" <td>15.411193</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>72</th>\n",
|
||
" <td>22.8</td>\n",
|
||
" <td>25.403213</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>452</th>\n",
|
||
" <td>16.1</td>\n",
|
||
" <td>18.855280</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Y_test Y_pred_lr\n",
|
||
"173 23.6 28.648960\n",
|
||
"274 32.4 36.495014\n",
|
||
"491 13.6 15.411193\n",
|
||
"72 22.8 25.403213\n",
|
||
"452 16.1 18.855280"
|
||
]
|
||
},
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"y_pred_lr = lr.predict(X_test)\n",
|
||
"check_test_lr = pd.DataFrame({\n",
|
||
" \"Y_test\": Y_test[\"price\"], \n",
|
||
" \"Y_pred_lr\": y_pred_lr.flatten()})\n",
|
||
"\n",
|
||
"check_test_lr.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "4a035a94",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"21.517444231176995\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn.metrics import mean_squared_error\n",
|
||
"\n",
|
||
"mean_squared_error_lr = mean_squared_error(check_test_lr[\"Y_pred_lr\"], check_test_lr[\"Y_test\"])\n",
|
||
"print(mean_squared_error_lr)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "2b87195e",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"0.711226005748496"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn.metrics import r2_score\n",
|
||
"\n",
|
||
"r2_score(Y_test, y_pred_lr)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "bbf6dc90",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Задание 2\n",
|
||
"\n",
|
||
"Создайте модель под названием model с помощью RandomForestRegressor из модуля sklearn.ensemble.\n",
|
||
"\n",
|
||
"Сделайте аргумент n_estimators равным 1000, max_depth должен быть равен 12 и random_state сделайте равным 42.\n",
|
||
"\n",
|
||
"Обучите модель на тренировочных данных аналогично тому, как вы обучали модель LinearRegression,\n",
|
||
"но при этом в метод fit вместо датафрейма y_train поставьте y_train.values[:, 0],\n",
|
||
"чтобы получить из датафрейма одномерный массив Numpy,\n",
|
||
"так как для класса RandomForestRegressor в данном методе для аргумента y предпочтительно применение массивов вместо датафрейма.\n",
|
||
"\n",
|
||
"Сделайте предсказание на тестовых данных и посчитайте R2.\n",
|
||
"\n",
|
||
"Сравните с результатом из предыдущего задания. Напишите в комментариях к коду, какая модель в данном случае работает лучше."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "f8f381fd",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"RandomForestRegressor(max_depth=12, n_estimators=1000, random_state=42)"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn.ensemble import RandomForestRegressor\n",
|
||
"model = RandomForestRegressor(n_estimators=1000, max_depth=12, random_state=42)\n",
|
||
"model.fit(X_train, Y_train.values[:, 0])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "c2733e21",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"y_pred_1 = model.predict(X_test)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "ff72edb4",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"0.87472606157312"
|
||
]
|
||
},
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"r2_score(Y_test, y_pred_1)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"id": "24d68924",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>y_test</th>\n",
|
||
" <th>y_pred_lr</th>\n",
|
||
" <th>y_pred_rf</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>173</th>\n",
|
||
" <td>23.6</td>\n",
|
||
" <td>28.648960</td>\n",
|
||
" <td>22.806412</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>274</th>\n",
|
||
" <td>32.4</td>\n",
|
||
" <td>36.495014</td>\n",
|
||
" <td>31.131464</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>491</th>\n",
|
||
" <td>13.6</td>\n",
|
||
" <td>15.411193</td>\n",
|
||
" <td>16.339125</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>72</th>\n",
|
||
" <td>22.8</td>\n",
|
||
" <td>25.403213</td>\n",
|
||
" <td>23.810726</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>452</th>\n",
|
||
" <td>16.1</td>\n",
|
||
" <td>18.855280</td>\n",
|
||
" <td>17.139521</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>76</th>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>23.146689</td>\n",
|
||
" <td>21.832284</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>316</th>\n",
|
||
" <td>17.8</td>\n",
|
||
" <td>17.392124</td>\n",
|
||
" <td>19.895747</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>140</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>14.078599</td>\n",
|
||
" <td>14.754118</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>471</th>\n",
|
||
" <td>19.6</td>\n",
|
||
" <td>23.036927</td>\n",
|
||
" <td>21.240835</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>500</th>\n",
|
||
" <td>16.8</td>\n",
|
||
" <td>20.599433</td>\n",
|
||
" <td>20.898658</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" y_test y_pred_lr y_pred_rf\n",
|
||
"173 23.6 28.648960 22.806412\n",
|
||
"274 32.4 36.495014 31.131464\n",
|
||
"491 13.6 15.411193 16.339125\n",
|
||
"72 22.8 25.403213 23.810726\n",
|
||
"452 16.1 18.855280 17.139521\n",
|
||
"76 20.0 23.146689 21.832284\n",
|
||
"316 17.8 17.392124 19.895747\n",
|
||
"140 14.0 14.078599 14.754118\n",
|
||
"471 19.6 23.036927 21.240835\n",
|
||
"500 16.8 20.599433 20.898658"
|
||
]
|
||
},
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"check_test = pd.DataFrame({\n",
|
||
" \"y_test\": Y_test[\"price\"],\n",
|
||
" \"y_pred_lr\": y_pred_lr.flatten(),\n",
|
||
" \"y_pred_rf\": y_pred_1.flatten(),\n",
|
||
"})\n",
|
||
"\n",
|
||
"check_test.head(10)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "9f09783f",
|
||
"metadata": {},
|
||
"source": [
|
||
"R2 из первого задания (≈0.711) меньше, чем R2 во втором задании (≈0.875), а значит, у модели, построенной с помощью RandomForestRegressor, предсказания ближе к тестовым значениям."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "a86c9368",
|
||
"metadata": {},
|
||
"source": [
|
||
"## *Задание 3\n",
|
||
"Вызовите документацию для класса RandomForestRegressor, найдите информацию об атрибуте feature_importances_.\n",
|
||
"\n",
|
||
"С помощью этого атрибута найдите сумму всех показателей важности, установите, какие два признака показывают наибольшую важность."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"id": "8acc1978",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"?RandomForestRegressor"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "cd674bb4",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[0.03167574 0.00154252 0.00713813 0.00123624 0.01426897 0.40268179\n",
|
||
" 0.01429864 0.06397257 0.00528122 0.01152493 0.01808108 0.01245085\n",
|
||
" 0.41584732]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print(model.feature_importances_)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"id": "1e1dbef5",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>feature_importance</th>\n",
|
||
" <th>name</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0.031676</td>\n",
|
||
" <td>CRIM</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0.001543</td>\n",
|
||
" <td>ZN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>0.007138</td>\n",
|
||
" <td>INDUS</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>0.001236</td>\n",
|
||
" <td>CHAS</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>0.014269</td>\n",
|
||
" <td>NOX</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>0.402682</td>\n",
|
||
" <td>RM</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>0.014299</td>\n",
|
||
" <td>AGE</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>0.063973</td>\n",
|
||
" <td>DIS</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>0.005281</td>\n",
|
||
" <td>RAD</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>0.011525</td>\n",
|
||
" <td>TAX</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>0.018081</td>\n",
|
||
" <td>PTRATIO</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>0.012451</td>\n",
|
||
" <td>B</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>0.415847</td>\n",
|
||
" <td>LSTAT</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" feature_importance name\n",
|
||
"0 0.031676 CRIM\n",
|
||
"1 0.001543 ZN\n",
|
||
"2 0.007138 INDUS\n",
|
||
"3 0.001236 CHAS\n",
|
||
"4 0.014269 NOX\n",
|
||
"5 0.402682 RM\n",
|
||
"6 0.014299 AGE\n",
|
||
"7 0.063973 DIS\n",
|
||
"8 0.005281 RAD\n",
|
||
"9 0.011525 TAX\n",
|
||
"10 0.018081 PTRATIO\n",
|
||
"11 0.012451 B\n",
|
||
"12 0.415847 LSTAT"
|
||
]
|
||
},
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"feature_importance = pd.DataFrame({'name':X.columns, \n",
|
||
" 'feature_importance':model.feature_importances_}, \n",
|
||
" columns=['feature_importance', 'name'])\n",
|
||
"feature_importance"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "3bf5bd9e",
|
||
"metadata": {},
|
||
"source": [
|
||
"Два признака, показывающие наибольшую важность:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"id": "82439470",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>feature_importance</th>\n",
|
||
" <th>name</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>0.415847</td>\n",
|
||
" <td>LSTAT</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>0.402682</td>\n",
|
||
" <td>RM</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" feature_importance name\n",
|
||
"12 0.415847 LSTAT\n",
|
||
"5 0.402682 RM"
|
||
]
|
||
},
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"feature_importance.nlargest(2, 'feature_importance')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "9c507bb6",
|
||
"metadata": {},
|
||
"source": [
|
||
"Сумма показателей важности:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"id": "2aae8d49",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"1.0"
|
||
]
|
||
},
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"model.feature_importances_.sum()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "3e9baeab",
|
||
"metadata": {},
|
||
"source": [
|
||
"## *Задание 4\n",
|
||
"\n",
|
||
"В этом задании мы будем работать с датасетом, с которым мы уже знакомы по домашнему заданию по библиотеке Matplotlib, это датасет Credit Card Fraud Detection. Для этого датасета мы будем решать задачу классификации - будем определять, какие из транзакций по кредитной карте являются мошенническими. Данный датасет сильно несбалансирован (так как случаи мошенничества относительно редки), так что применение метрики accuracy не принесет пользы и не поможет выбрать лучшую модель. Мы будем вычислять AUC, то есть площадь под кривой ROC.\n",
|
||
"\n",
|
||
"Импортируйте из соответствующих модулей RandomForestClassifier, GridSearchCV и train_test_split.\n",
|
||
"\n",
|
||
"Загрузите датасет creditcard.csv и создайте датафрейм df.\n",
|
||
"\n",
|
||
"С помощью метода value_counts с аргументом normalize=True убедитесь в том, что выборка несбалансирована. Используя метод info, проверьте, все ли столбцы содержат числовые данные и нет ли в них пропусков. Примените следующую настройку, чтобы можно было просматривать все столбцы датафрейма: pd.options.display.max_columns = 100.\n",
|
||
"\n",
|
||
"Просмотрите первые 10 строк датафрейма df.\n",
|
||
"\n",
|
||
"Создайте датафрейм X из датафрейма df, исключив столбец Class.\n",
|
||
"\n",
|
||
"Создайте объект Series под названием y из столбца Class.\n",
|
||
"\n",
|
||
"Разбейте X и y на тренировочный и тестовый наборы данных при помощи функции train_test_split, используя аргументы: test_size=0.3, random_state=100, stratify=y. У вас должны получиться объекты X_train, X_test, y_train и y_test.\n",
|
||
"\n",
|
||
"Просмотрите информацию о их форме. Для поиска по сетке параметров задайте такие параметры: parameters = [{'n_estimators': [10, 15],'max_features': np.arange(3, 5),'max_depth': np.arange(4, 7)}]\n",
|
||
"\n",
|
||
"Создайте модель GridSearchCV со следующими аргументами: estimator=RandomForestClassifier(random_state=100), param_grid=parameters, scoring='roc_auc', cv=3.\n",
|
||
"\n",
|
||
"Обучите модель на тренировочном наборе данных (может занять несколько минут).\n",
|
||
"\n",
|
||
"Просмотрите параметры лучшей модели с помощью атрибута best_params_.\n",
|
||
"\n",
|
||
"Предскажите вероятности классов с помощью полученной модели и метода predict_proba.\n",
|
||
"\n",
|
||
"Из полученного результата (массив Numpy) выберите столбец с индексом 1 (вероятность класса 1) и запишите в массив y_pred_proba.\n",
|
||
"\n",
|
||
"Из модуля sklearn.metrics импортируйте метрику roc_auc_score.\n",
|
||
"\n",
|
||
"Вычислите AUC на тестовых данных и сравните с результатом, полученным на тренировочных данных, используя в качестве аргументов массивы y_test и y_pred_proba."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"id": "c890d00a",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Time</th>\n",
|
||
" <th>V1</th>\n",
|
||
" <th>V2</th>\n",
|
||
" <th>V3</th>\n",
|
||
" <th>V4</th>\n",
|
||
" <th>V5</th>\n",
|
||
" <th>V6</th>\n",
|
||
" <th>V7</th>\n",
|
||
" <th>V8</th>\n",
|
||
" <th>V9</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>V21</th>\n",
|
||
" <th>V22</th>\n",
|
||
" <th>V23</th>\n",
|
||
" <th>V24</th>\n",
|
||
" <th>V25</th>\n",
|
||
" <th>V26</th>\n",
|
||
" <th>V27</th>\n",
|
||
" <th>V28</th>\n",
|
||
" <th>Amount</th>\n",
|
||
" <th>Class</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>-1.359807</td>\n",
|
||
" <td>-0.072781</td>\n",
|
||
" <td>2.536347</td>\n",
|
||
" <td>1.378155</td>\n",
|
||
" <td>-0.338321</td>\n",
|
||
" <td>0.462388</td>\n",
|
||
" <td>0.239599</td>\n",
|
||
" <td>0.098698</td>\n",
|
||
" <td>0.363787</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.018307</td>\n",
|
||
" <td>0.277838</td>\n",
|
||
" <td>-0.110474</td>\n",
|
||
" <td>0.066928</td>\n",
|
||
" <td>0.128539</td>\n",
|
||
" <td>-0.189115</td>\n",
|
||
" <td>0.133558</td>\n",
|
||
" <td>-0.021053</td>\n",
|
||
" <td>149.62</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.191857</td>\n",
|
||
" <td>0.266151</td>\n",
|
||
" <td>0.166480</td>\n",
|
||
" <td>0.448154</td>\n",
|
||
" <td>0.060018</td>\n",
|
||
" <td>-0.082361</td>\n",
|
||
" <td>-0.078803</td>\n",
|
||
" <td>0.085102</td>\n",
|
||
" <td>-0.255425</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.225775</td>\n",
|
||
" <td>-0.638672</td>\n",
|
||
" <td>0.101288</td>\n",
|
||
" <td>-0.339846</td>\n",
|
||
" <td>0.167170</td>\n",
|
||
" <td>0.125895</td>\n",
|
||
" <td>-0.008983</td>\n",
|
||
" <td>0.014724</td>\n",
|
||
" <td>2.69</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>-1.358354</td>\n",
|
||
" <td>-1.340163</td>\n",
|
||
" <td>1.773209</td>\n",
|
||
" <td>0.379780</td>\n",
|
||
" <td>-0.503198</td>\n",
|
||
" <td>1.800499</td>\n",
|
||
" <td>0.791461</td>\n",
|
||
" <td>0.247676</td>\n",
|
||
" <td>-1.514654</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.247998</td>\n",
|
||
" <td>0.771679</td>\n",
|
||
" <td>0.909412</td>\n",
|
||
" <td>-0.689281</td>\n",
|
||
" <td>-0.327642</td>\n",
|
||
" <td>-0.139097</td>\n",
|
||
" <td>-0.055353</td>\n",
|
||
" <td>-0.059752</td>\n",
|
||
" <td>378.66</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>-0.966272</td>\n",
|
||
" <td>-0.185226</td>\n",
|
||
" <td>1.792993</td>\n",
|
||
" <td>-0.863291</td>\n",
|
||
" <td>-0.010309</td>\n",
|
||
" <td>1.247203</td>\n",
|
||
" <td>0.237609</td>\n",
|
||
" <td>0.377436</td>\n",
|
||
" <td>-1.387024</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.108300</td>\n",
|
||
" <td>0.005274</td>\n",
|
||
" <td>-0.190321</td>\n",
|
||
" <td>-1.175575</td>\n",
|
||
" <td>0.647376</td>\n",
|
||
" <td>-0.221929</td>\n",
|
||
" <td>0.062723</td>\n",
|
||
" <td>0.061458</td>\n",
|
||
" <td>123.50</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>-1.158233</td>\n",
|
||
" <td>0.877737</td>\n",
|
||
" <td>1.548718</td>\n",
|
||
" <td>0.403034</td>\n",
|
||
" <td>-0.407193</td>\n",
|
||
" <td>0.095921</td>\n",
|
||
" <td>0.592941</td>\n",
|
||
" <td>-0.270533</td>\n",
|
||
" <td>0.817739</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.009431</td>\n",
|
||
" <td>0.798278</td>\n",
|
||
" <td>-0.137458</td>\n",
|
||
" <td>0.141267</td>\n",
|
||
" <td>-0.206010</td>\n",
|
||
" <td>0.502292</td>\n",
|
||
" <td>0.219422</td>\n",
|
||
" <td>0.215153</td>\n",
|
||
" <td>69.99</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>-0.425966</td>\n",
|
||
" <td>0.960523</td>\n",
|
||
" <td>1.141109</td>\n",
|
||
" <td>-0.168252</td>\n",
|
||
" <td>0.420987</td>\n",
|
||
" <td>-0.029728</td>\n",
|
||
" <td>0.476201</td>\n",
|
||
" <td>0.260314</td>\n",
|
||
" <td>-0.568671</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.208254</td>\n",
|
||
" <td>-0.559825</td>\n",
|
||
" <td>-0.026398</td>\n",
|
||
" <td>-0.371427</td>\n",
|
||
" <td>-0.232794</td>\n",
|
||
" <td>0.105915</td>\n",
|
||
" <td>0.253844</td>\n",
|
||
" <td>0.081080</td>\n",
|
||
" <td>3.67</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>1.229658</td>\n",
|
||
" <td>0.141004</td>\n",
|
||
" <td>0.045371</td>\n",
|
||
" <td>1.202613</td>\n",
|
||
" <td>0.191881</td>\n",
|
||
" <td>0.272708</td>\n",
|
||
" <td>-0.005159</td>\n",
|
||
" <td>0.081213</td>\n",
|
||
" <td>0.464960</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.167716</td>\n",
|
||
" <td>-0.270710</td>\n",
|
||
" <td>-0.154104</td>\n",
|
||
" <td>-0.780055</td>\n",
|
||
" <td>0.750137</td>\n",
|
||
" <td>-0.257237</td>\n",
|
||
" <td>0.034507</td>\n",
|
||
" <td>0.005168</td>\n",
|
||
" <td>4.99</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>7.0</td>\n",
|
||
" <td>-0.644269</td>\n",
|
||
" <td>1.417964</td>\n",
|
||
" <td>1.074380</td>\n",
|
||
" <td>-0.492199</td>\n",
|
||
" <td>0.948934</td>\n",
|
||
" <td>0.428118</td>\n",
|
||
" <td>1.120631</td>\n",
|
||
" <td>-3.807864</td>\n",
|
||
" <td>0.615375</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1.943465</td>\n",
|
||
" <td>-1.015455</td>\n",
|
||
" <td>0.057504</td>\n",
|
||
" <td>-0.649709</td>\n",
|
||
" <td>-0.415267</td>\n",
|
||
" <td>-0.051634</td>\n",
|
||
" <td>-1.206921</td>\n",
|
||
" <td>-1.085339</td>\n",
|
||
" <td>40.80</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>7.0</td>\n",
|
||
" <td>-0.894286</td>\n",
|
||
" <td>0.286157</td>\n",
|
||
" <td>-0.113192</td>\n",
|
||
" <td>-0.271526</td>\n",
|
||
" <td>2.669599</td>\n",
|
||
" <td>3.721818</td>\n",
|
||
" <td>0.370145</td>\n",
|
||
" <td>0.851084</td>\n",
|
||
" <td>-0.392048</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.073425</td>\n",
|
||
" <td>-0.268092</td>\n",
|
||
" <td>-0.204233</td>\n",
|
||
" <td>1.011592</td>\n",
|
||
" <td>0.373205</td>\n",
|
||
" <td>-0.384157</td>\n",
|
||
" <td>0.011747</td>\n",
|
||
" <td>0.142404</td>\n",
|
||
" <td>93.20</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>9.0</td>\n",
|
||
" <td>-0.338262</td>\n",
|
||
" <td>1.119593</td>\n",
|
||
" <td>1.044367</td>\n",
|
||
" <td>-0.222187</td>\n",
|
||
" <td>0.499361</td>\n",
|
||
" <td>-0.246761</td>\n",
|
||
" <td>0.651583</td>\n",
|
||
" <td>0.069539</td>\n",
|
||
" <td>-0.736727</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.246914</td>\n",
|
||
" <td>-0.633753</td>\n",
|
||
" <td>-0.120794</td>\n",
|
||
" <td>-0.385050</td>\n",
|
||
" <td>-0.069733</td>\n",
|
||
" <td>0.094199</td>\n",
|
||
" <td>0.246219</td>\n",
|
||
" <td>0.083076</td>\n",
|
||
" <td>3.68</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>10 rows × 31 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Time V1 V2 V3 V4 V5 V6 V7 \\\n",
|
||
"0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n",
|
||
"1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n",
|
||
"2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n",
|
||
"3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n",
|
||
"4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n",
|
||
"5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 \n",
|
||
"6 4.0 1.229658 0.141004 0.045371 1.202613 0.191881 0.272708 -0.005159 \n",
|
||
"7 7.0 -0.644269 1.417964 1.074380 -0.492199 0.948934 0.428118 1.120631 \n",
|
||
"8 7.0 -0.894286 0.286157 -0.113192 -0.271526 2.669599 3.721818 0.370145 \n",
|
||
"9 9.0 -0.338262 1.119593 1.044367 -0.222187 0.499361 -0.246761 0.651583 \n",
|
||
"\n",
|
||
" V8 V9 ... V21 V22 V23 V24 V25 \\\n",
|
||
"0 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 \n",
|
||
"1 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 \n",
|
||
"2 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 \n",
|
||
"3 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 \n",
|
||
"4 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 \n",
|
||
"5 0.260314 -0.568671 ... -0.208254 -0.559825 -0.026398 -0.371427 -0.232794 \n",
|
||
"6 0.081213 0.464960 ... -0.167716 -0.270710 -0.154104 -0.780055 0.750137 \n",
|
||
"7 -3.807864 0.615375 ... 1.943465 -1.015455 0.057504 -0.649709 -0.415267 \n",
|
||
"8 0.851084 -0.392048 ... -0.073425 -0.268092 -0.204233 1.011592 0.373205 \n",
|
||
"9 0.069539 -0.736727 ... -0.246914 -0.633753 -0.120794 -0.385050 -0.069733 \n",
|
||
"\n",
|
||
" V26 V27 V28 Amount Class \n",
|
||
"0 -0.189115 0.133558 -0.021053 149.62 0 \n",
|
||
"1 0.125895 -0.008983 0.014724 2.69 0 \n",
|
||
"2 -0.139097 -0.055353 -0.059752 378.66 0 \n",
|
||
"3 -0.221929 0.062723 0.061458 123.50 0 \n",
|
||
"4 0.502292 0.219422 0.215153 69.99 0 \n",
|
||
"5 0.105915 0.253844 0.081080 3.67 0 \n",
|
||
"6 -0.257237 0.034507 0.005168 4.99 0 \n",
|
||
"7 -0.051634 -1.206921 -1.085339 40.80 0 \n",
|
||
"8 -0.384157 0.011747 0.142404 93.20 0 \n",
|
||
"9 0.094199 0.246219 0.083076 3.68 0 \n",
|
||
"\n",
|
||
"[10 rows x 31 columns]"
|
||
]
|
||
},
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||
"from sklearn.model_selection import GridSearchCV\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"\n",
|
||
"df = pd.read_csv('creditcard.csv')\n",
|
||
"df.head(10)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"id": "0201a188",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"0 0.998273\n",
|
||
"1 0.001727\n",
|
||
"Name: Class, dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df['Class'].value_counts(normalize=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"id": "9ffa6a96",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 284807 entries, 0 to 284806\n",
|
||
"Data columns (total 31 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 Time 284807 non-null float64\n",
|
||
" 1 V1 284807 non-null float64\n",
|
||
" 2 V2 284807 non-null float64\n",
|
||
" 3 V3 284807 non-null float64\n",
|
||
" 4 V4 284807 non-null float64\n",
|
||
" 5 V5 284807 non-null float64\n",
|
||
" 6 V6 284807 non-null float64\n",
|
||
" 7 V7 284807 non-null float64\n",
|
||
" 8 V8 284807 non-null float64\n",
|
||
" 9 V9 284807 non-null float64\n",
|
||
" 10 V10 284807 non-null float64\n",
|
||
" 11 V11 284807 non-null float64\n",
|
||
" 12 V12 284807 non-null float64\n",
|
||
" 13 V13 284807 non-null float64\n",
|
||
" 14 V14 284807 non-null float64\n",
|
||
" 15 V15 284807 non-null float64\n",
|
||
" 16 V16 284807 non-null float64\n",
|
||
" 17 V17 284807 non-null float64\n",
|
||
" 18 V18 284807 non-null float64\n",
|
||
" 19 V19 284807 non-null float64\n",
|
||
" 20 V20 284807 non-null float64\n",
|
||
" 21 V21 284807 non-null float64\n",
|
||
" 22 V22 284807 non-null float64\n",
|
||
" 23 V23 284807 non-null float64\n",
|
||
" 24 V24 284807 non-null float64\n",
|
||
" 25 V25 284807 non-null float64\n",
|
||
" 26 V26 284807 non-null float64\n",
|
||
" 27 V27 284807 non-null float64\n",
|
||
" 28 V28 284807 non-null float64\n",
|
||
" 29 Amount 284807 non-null float64\n",
|
||
" 30 Class 284807 non-null int64 \n",
|
||
"dtypes: float64(30), int64(1)\n",
|
||
"memory usage: 67.4 MB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"id": "8d3439e3",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"pd.set_option('display.max_columns', 100)  # show up to 100 columns when rendering frames"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"id": "0f57a690",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Time</th>\n",
|
||
" <th>V1</th>\n",
|
||
" <th>V2</th>\n",
|
||
" <th>V3</th>\n",
|
||
" <th>V4</th>\n",
|
||
" <th>V5</th>\n",
|
||
" <th>V6</th>\n",
|
||
" <th>V7</th>\n",
|
||
" <th>V8</th>\n",
|
||
" <th>V9</th>\n",
|
||
" <th>V10</th>\n",
|
||
" <th>V11</th>\n",
|
||
" <th>V12</th>\n",
|
||
" <th>V13</th>\n",
|
||
" <th>V14</th>\n",
|
||
" <th>V15</th>\n",
|
||
" <th>V16</th>\n",
|
||
" <th>V17</th>\n",
|
||
" <th>V18</th>\n",
|
||
" <th>V19</th>\n",
|
||
" <th>V20</th>\n",
|
||
" <th>V21</th>\n",
|
||
" <th>V22</th>\n",
|
||
" <th>V23</th>\n",
|
||
" <th>V24</th>\n",
|
||
" <th>V25</th>\n",
|
||
" <th>V26</th>\n",
|
||
" <th>V27</th>\n",
|
||
" <th>V28</th>\n",
|
||
" <th>Amount</th>\n",
|
||
" <th>Class</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>-1.359807</td>\n",
|
||
" <td>-0.072781</td>\n",
|
||
" <td>2.536347</td>\n",
|
||
" <td>1.378155</td>\n",
|
||
" <td>-0.338321</td>\n",
|
||
" <td>0.462388</td>\n",
|
||
" <td>0.239599</td>\n",
|
||
" <td>0.098698</td>\n",
|
||
" <td>0.363787</td>\n",
|
||
" <td>0.090794</td>\n",
|
||
" <td>-0.551600</td>\n",
|
||
" <td>-0.617801</td>\n",
|
||
" <td>-0.991390</td>\n",
|
||
" <td>-0.311169</td>\n",
|
||
" <td>1.468177</td>\n",
|
||
" <td>-0.470401</td>\n",
|
||
" <td>0.207971</td>\n",
|
||
" <td>0.025791</td>\n",
|
||
" <td>0.403993</td>\n",
|
||
" <td>0.251412</td>\n",
|
||
" <td>-0.018307</td>\n",
|
||
" <td>0.277838</td>\n",
|
||
" <td>-0.110474</td>\n",
|
||
" <td>0.066928</td>\n",
|
||
" <td>0.128539</td>\n",
|
||
" <td>-0.189115</td>\n",
|
||
" <td>0.133558</td>\n",
|
||
" <td>-0.021053</td>\n",
|
||
" <td>149.62</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.191857</td>\n",
|
||
" <td>0.266151</td>\n",
|
||
" <td>0.166480</td>\n",
|
||
" <td>0.448154</td>\n",
|
||
" <td>0.060018</td>\n",
|
||
" <td>-0.082361</td>\n",
|
||
" <td>-0.078803</td>\n",
|
||
" <td>0.085102</td>\n",
|
||
" <td>-0.255425</td>\n",
|
||
" <td>-0.166974</td>\n",
|
||
" <td>1.612727</td>\n",
|
||
" <td>1.065235</td>\n",
|
||
" <td>0.489095</td>\n",
|
||
" <td>-0.143772</td>\n",
|
||
" <td>0.635558</td>\n",
|
||
" <td>0.463917</td>\n",
|
||
" <td>-0.114805</td>\n",
|
||
" <td>-0.183361</td>\n",
|
||
" <td>-0.145783</td>\n",
|
||
" <td>-0.069083</td>\n",
|
||
" <td>-0.225775</td>\n",
|
||
" <td>-0.638672</td>\n",
|
||
" <td>0.101288</td>\n",
|
||
" <td>-0.339846</td>\n",
|
||
" <td>0.167170</td>\n",
|
||
" <td>0.125895</td>\n",
|
||
" <td>-0.008983</td>\n",
|
||
" <td>0.014724</td>\n",
|
||
" <td>2.69</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>-1.358354</td>\n",
|
||
" <td>-1.340163</td>\n",
|
||
" <td>1.773209</td>\n",
|
||
" <td>0.379780</td>\n",
|
||
" <td>-0.503198</td>\n",
|
||
" <td>1.800499</td>\n",
|
||
" <td>0.791461</td>\n",
|
||
" <td>0.247676</td>\n",
|
||
" <td>-1.514654</td>\n",
|
||
" <td>0.207643</td>\n",
|
||
" <td>0.624501</td>\n",
|
||
" <td>0.066084</td>\n",
|
||
" <td>0.717293</td>\n",
|
||
" <td>-0.165946</td>\n",
|
||
" <td>2.345865</td>\n",
|
||
" <td>-2.890083</td>\n",
|
||
" <td>1.109969</td>\n",
|
||
" <td>-0.121359</td>\n",
|
||
" <td>-2.261857</td>\n",
|
||
" <td>0.524980</td>\n",
|
||
" <td>0.247998</td>\n",
|
||
" <td>0.771679</td>\n",
|
||
" <td>0.909412</td>\n",
|
||
" <td>-0.689281</td>\n",
|
||
" <td>-0.327642</td>\n",
|
||
" <td>-0.139097</td>\n",
|
||
" <td>-0.055353</td>\n",
|
||
" <td>-0.059752</td>\n",
|
||
" <td>378.66</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>-0.966272</td>\n",
|
||
" <td>-0.185226</td>\n",
|
||
" <td>1.792993</td>\n",
|
||
" <td>-0.863291</td>\n",
|
||
" <td>-0.010309</td>\n",
|
||
" <td>1.247203</td>\n",
|
||
" <td>0.237609</td>\n",
|
||
" <td>0.377436</td>\n",
|
||
" <td>-1.387024</td>\n",
|
||
" <td>-0.054952</td>\n",
|
||
" <td>-0.226487</td>\n",
|
||
" <td>0.178228</td>\n",
|
||
" <td>0.507757</td>\n",
|
||
" <td>-0.287924</td>\n",
|
||
" <td>-0.631418</td>\n",
|
||
" <td>-1.059647</td>\n",
|
||
" <td>-0.684093</td>\n",
|
||
" <td>1.965775</td>\n",
|
||
" <td>-1.232622</td>\n",
|
||
" <td>-0.208038</td>\n",
|
||
" <td>-0.108300</td>\n",
|
||
" <td>0.005274</td>\n",
|
||
" <td>-0.190321</td>\n",
|
||
" <td>-1.175575</td>\n",
|
||
" <td>0.647376</td>\n",
|
||
" <td>-0.221929</td>\n",
|
||
" <td>0.062723</td>\n",
|
||
" <td>0.061458</td>\n",
|
||
" <td>123.50</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>-1.158233</td>\n",
|
||
" <td>0.877737</td>\n",
|
||
" <td>1.548718</td>\n",
|
||
" <td>0.403034</td>\n",
|
||
" <td>-0.407193</td>\n",
|
||
" <td>0.095921</td>\n",
|
||
" <td>0.592941</td>\n",
|
||
" <td>-0.270533</td>\n",
|
||
" <td>0.817739</td>\n",
|
||
" <td>0.753074</td>\n",
|
||
" <td>-0.822843</td>\n",
|
||
" <td>0.538196</td>\n",
|
||
" <td>1.345852</td>\n",
|
||
" <td>-1.119670</td>\n",
|
||
" <td>0.175121</td>\n",
|
||
" <td>-0.451449</td>\n",
|
||
" <td>-0.237033</td>\n",
|
||
" <td>-0.038195</td>\n",
|
||
" <td>0.803487</td>\n",
|
||
" <td>0.408542</td>\n",
|
||
" <td>-0.009431</td>\n",
|
||
" <td>0.798278</td>\n",
|
||
" <td>-0.137458</td>\n",
|
||
" <td>0.141267</td>\n",
|
||
" <td>-0.206010</td>\n",
|
||
" <td>0.502292</td>\n",
|
||
" <td>0.219422</td>\n",
|
||
" <td>0.215153</td>\n",
|
||
" <td>69.99</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>-0.425966</td>\n",
|
||
" <td>0.960523</td>\n",
|
||
" <td>1.141109</td>\n",
|
||
" <td>-0.168252</td>\n",
|
||
" <td>0.420987</td>\n",
|
||
" <td>-0.029728</td>\n",
|
||
" <td>0.476201</td>\n",
|
||
" <td>0.260314</td>\n",
|
||
" <td>-0.568671</td>\n",
|
||
" <td>-0.371407</td>\n",
|
||
" <td>1.341262</td>\n",
|
||
" <td>0.359894</td>\n",
|
||
" <td>-0.358091</td>\n",
|
||
" <td>-0.137134</td>\n",
|
||
" <td>0.517617</td>\n",
|
||
" <td>0.401726</td>\n",
|
||
" <td>-0.058133</td>\n",
|
||
" <td>0.068653</td>\n",
|
||
" <td>-0.033194</td>\n",
|
||
" <td>0.084968</td>\n",
|
||
" <td>-0.208254</td>\n",
|
||
" <td>-0.559825</td>\n",
|
||
" <td>-0.026398</td>\n",
|
||
" <td>-0.371427</td>\n",
|
||
" <td>-0.232794</td>\n",
|
||
" <td>0.105915</td>\n",
|
||
" <td>0.253844</td>\n",
|
||
" <td>0.081080</td>\n",
|
||
" <td>3.67</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>1.229658</td>\n",
|
||
" <td>0.141004</td>\n",
|
||
" <td>0.045371</td>\n",
|
||
" <td>1.202613</td>\n",
|
||
" <td>0.191881</td>\n",
|
||
" <td>0.272708</td>\n",
|
||
" <td>-0.005159</td>\n",
|
||
" <td>0.081213</td>\n",
|
||
" <td>0.464960</td>\n",
|
||
" <td>-0.099254</td>\n",
|
||
" <td>-1.416907</td>\n",
|
||
" <td>-0.153826</td>\n",
|
||
" <td>-0.751063</td>\n",
|
||
" <td>0.167372</td>\n",
|
||
" <td>0.050144</td>\n",
|
||
" <td>-0.443587</td>\n",
|
||
" <td>0.002821</td>\n",
|
||
" <td>-0.611987</td>\n",
|
||
" <td>-0.045575</td>\n",
|
||
" <td>-0.219633</td>\n",
|
||
" <td>-0.167716</td>\n",
|
||
" <td>-0.270710</td>\n",
|
||
" <td>-0.154104</td>\n",
|
||
" <td>-0.780055</td>\n",
|
||
" <td>0.750137</td>\n",
|
||
" <td>-0.257237</td>\n",
|
||
" <td>0.034507</td>\n",
|
||
" <td>0.005168</td>\n",
|
||
" <td>4.99</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>7.0</td>\n",
|
||
" <td>-0.644269</td>\n",
|
||
" <td>1.417964</td>\n",
|
||
" <td>1.074380</td>\n",
|
||
" <td>-0.492199</td>\n",
|
||
" <td>0.948934</td>\n",
|
||
" <td>0.428118</td>\n",
|
||
" <td>1.120631</td>\n",
|
||
" <td>-3.807864</td>\n",
|
||
" <td>0.615375</td>\n",
|
||
" <td>1.249376</td>\n",
|
||
" <td>-0.619468</td>\n",
|
||
" <td>0.291474</td>\n",
|
||
" <td>1.757964</td>\n",
|
||
" <td>-1.323865</td>\n",
|
||
" <td>0.686133</td>\n",
|
||
" <td>-0.076127</td>\n",
|
||
" <td>-1.222127</td>\n",
|
||
" <td>-0.358222</td>\n",
|
||
" <td>0.324505</td>\n",
|
||
" <td>-0.156742</td>\n",
|
||
" <td>1.943465</td>\n",
|
||
" <td>-1.015455</td>\n",
|
||
" <td>0.057504</td>\n",
|
||
" <td>-0.649709</td>\n",
|
||
" <td>-0.415267</td>\n",
|
||
" <td>-0.051634</td>\n",
|
||
" <td>-1.206921</td>\n",
|
||
" <td>-1.085339</td>\n",
|
||
" <td>40.80</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>7.0</td>\n",
|
||
" <td>-0.894286</td>\n",
|
||
" <td>0.286157</td>\n",
|
||
" <td>-0.113192</td>\n",
|
||
" <td>-0.271526</td>\n",
|
||
" <td>2.669599</td>\n",
|
||
" <td>3.721818</td>\n",
|
||
" <td>0.370145</td>\n",
|
||
" <td>0.851084</td>\n",
|
||
" <td>-0.392048</td>\n",
|
||
" <td>-0.410430</td>\n",
|
||
" <td>-0.705117</td>\n",
|
||
" <td>-0.110452</td>\n",
|
||
" <td>-0.286254</td>\n",
|
||
" <td>0.074355</td>\n",
|
||
" <td>-0.328783</td>\n",
|
||
" <td>-0.210077</td>\n",
|
||
" <td>-0.499768</td>\n",
|
||
" <td>0.118765</td>\n",
|
||
" <td>0.570328</td>\n",
|
||
" <td>0.052736</td>\n",
|
||
" <td>-0.073425</td>\n",
|
||
" <td>-0.268092</td>\n",
|
||
" <td>-0.204233</td>\n",
|
||
" <td>1.011592</td>\n",
|
||
" <td>0.373205</td>\n",
|
||
" <td>-0.384157</td>\n",
|
||
" <td>0.011747</td>\n",
|
||
" <td>0.142404</td>\n",
|
||
" <td>93.20</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>9.0</td>\n",
|
||
" <td>-0.338262</td>\n",
|
||
" <td>1.119593</td>\n",
|
||
" <td>1.044367</td>\n",
|
||
" <td>-0.222187</td>\n",
|
||
" <td>0.499361</td>\n",
|
||
" <td>-0.246761</td>\n",
|
||
" <td>0.651583</td>\n",
|
||
" <td>0.069539</td>\n",
|
||
" <td>-0.736727</td>\n",
|
||
" <td>-0.366846</td>\n",
|
||
" <td>1.017614</td>\n",
|
||
" <td>0.836390</td>\n",
|
||
" <td>1.006844</td>\n",
|
||
" <td>-0.443523</td>\n",
|
||
" <td>0.150219</td>\n",
|
||
" <td>0.739453</td>\n",
|
||
" <td>-0.540980</td>\n",
|
||
" <td>0.476677</td>\n",
|
||
" <td>0.451773</td>\n",
|
||
" <td>0.203711</td>\n",
|
||
" <td>-0.246914</td>\n",
|
||
" <td>-0.633753</td>\n",
|
||
" <td>-0.120794</td>\n",
|
||
" <td>-0.385050</td>\n",
|
||
" <td>-0.069733</td>\n",
|
||
" <td>0.094199</td>\n",
|
||
" <td>0.246219</td>\n",
|
||
" <td>0.083076</td>\n",
|
||
" <td>3.68</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Time V1 V2 V3 V4 V5 V6 V7 \\\n",
|
||
"0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n",
|
||
"1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n",
|
||
"2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n",
|
||
"3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n",
|
||
"4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n",
|
||
"5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 \n",
|
||
"6 4.0 1.229658 0.141004 0.045371 1.202613 0.191881 0.272708 -0.005159 \n",
|
||
"7 7.0 -0.644269 1.417964 1.074380 -0.492199 0.948934 0.428118 1.120631 \n",
|
||
"8 7.0 -0.894286 0.286157 -0.113192 -0.271526 2.669599 3.721818 0.370145 \n",
|
||
"9 9.0 -0.338262 1.119593 1.044367 -0.222187 0.499361 -0.246761 0.651583 \n",
|
||
"\n",
|
||
" V8 V9 V10 V11 V12 V13 V14 \\\n",
|
||
"0 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 \n",
|
||
"1 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 \n",
|
||
"2 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 \n",
|
||
"3 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 \n",
|
||
"4 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 \n",
|
||
"5 0.260314 -0.568671 -0.371407 1.341262 0.359894 -0.358091 -0.137134 \n",
|
||
"6 0.081213 0.464960 -0.099254 -1.416907 -0.153826 -0.751063 0.167372 \n",
|
||
"7 -3.807864 0.615375 1.249376 -0.619468 0.291474 1.757964 -1.323865 \n",
|
||
"8 0.851084 -0.392048 -0.410430 -0.705117 -0.110452 -0.286254 0.074355 \n",
|
||
"9 0.069539 -0.736727 -0.366846 1.017614 0.836390 1.006844 -0.443523 \n",
|
||
"\n",
|
||
" V15 V16 V17 V18 V19 V20 V21 \\\n",
|
||
"0 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 \n",
|
||
"1 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 \n",
|
||
"2 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 \n",
|
||
"3 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 -0.208038 -0.108300 \n",
|
||
"4 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 \n",
|
||
"5 0.517617 0.401726 -0.058133 0.068653 -0.033194 0.084968 -0.208254 \n",
|
||
"6 0.050144 -0.443587 0.002821 -0.611987 -0.045575 -0.219633 -0.167716 \n",
|
||
"7 0.686133 -0.076127 -1.222127 -0.358222 0.324505 -0.156742 1.943465 \n",
|
||
"8 -0.328783 -0.210077 -0.499768 0.118765 0.570328 0.052736 -0.073425 \n",
|
||
"9 0.150219 0.739453 -0.540980 0.476677 0.451773 0.203711 -0.246914 \n",
|
||
"\n",
|
||
" V22 V23 V24 V25 V26 V27 V28 \\\n",
|
||
"0 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 \n",
|
||
"1 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 \n",
|
||
"2 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 \n",
|
||
"3 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 \n",
|
||
"4 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 \n",
|
||
"5 -0.559825 -0.026398 -0.371427 -0.232794 0.105915 0.253844 0.081080 \n",
|
||
"6 -0.270710 -0.154104 -0.780055 0.750137 -0.257237 0.034507 0.005168 \n",
|
||
"7 -1.015455 0.057504 -0.649709 -0.415267 -0.051634 -1.206921 -1.085339 \n",
|
||
"8 -0.268092 -0.204233 1.011592 0.373205 -0.384157 0.011747 0.142404 \n",
|
||
"9 -0.633753 -0.120794 -0.385050 -0.069733 0.094199 0.246219 0.083076 \n",
|
||
"\n",
|
||
" Amount Class \n",
|
||
"0 149.62 0 \n",
|
||
"1 2.69 0 \n",
|
||
"2 378.66 0 \n",
|
||
"3 123.50 0 \n",
|
||
"4 69.99 0 \n",
|
||
"5 3.67 0 \n",
|
||
"6 4.99 0 \n",
|
||
"7 40.80 0 \n",
|
||
"8 93.20 0 \n",
|
||
"9 3.68 0 "
|
||
]
|
||
},
|
||
"execution_count": 27,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df.head(10)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 28,
|
||
"id": "3530430c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"X = df.drop(columns=[\"Class\"])\n",
|
||
"y = df[\"Class\"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"id": "c66ea2e2",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 30,
|
||
"id": "1e628dd0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"X_train (199364, 30)\n",
|
||
"X_test (85443, 30)\n",
|
||
"y_train (199364,)\n",
|
||
"y_test (85443,)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print('X_train ', X_train.shape)\n",
|
||
"print('X_test ', X_test.shape)\n",
|
||
"print('y_train ', y_train.shape)\n",
|
||
"print('y_test ', y_test.shape)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 31,
|
||
"id": "8908147f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"parameters = [dict(n_estimators=[10, 15], max_features=np.arange(3, 5), max_depth=np.arange(4, 7))]  # hyper-parameter grid passed to GridSearchCV"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"id": "8f593bef",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"clf = GridSearchCV(\n",
|
||
" estimator=RandomForestClassifier(random_state=100),\n",
|
||
" param_grid=parameters,\n",
|
||
" scoring='roc_auc',\n",
|
||
" cv=3,\n",
|
||
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 33,
|
||
"id": "99d17337",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=100),\n",
|
||
" param_grid=[{'max_depth': array([4, 5, 6]),\n",
|
||
" 'max_features': array([3, 4]),\n",
|
||
" 'n_estimators': [10, 15]}],\n",
|
||
" scoring='roc_auc')"
|
||
]
|
||
},
|
||
"execution_count": 33,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"clf.fit(X_train, y_train)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 34,
|
||
"id": "79f8c7e5",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"{'max_depth': 6, 'max_features': 3, 'n_estimators': 15}"
|
||
]
|
||
},
|
||
"execution_count": 34,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"clf.best_params_"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"id": "74efab0a",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"RandomForestClassifier(max_depth=6, max_features=3, n_estimators=15)"
|
||
]
|
||
},
|
||
"execution_count": 35,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"clf = RandomForestClassifier(max_depth=6, max_features=3, n_estimators=15, random_state=100)  # best grid-search params; seeded like the search estimator so the fit and ROC AUC are reproducible\n",
|
||
"\n",
|
||
"clf.fit(X_train, y_train)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 36,
|
||
"id": "e4d1fe4a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"y_pred = clf.predict_proba(X_test)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 37,
|
||
"id": "49357f79",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"y_pred_proba = y_pred[:, 1]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 38,
|
||
"id": "fc40ec74",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.metrics import roc_auc_score"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 39,
|
||
"id": "84b0112b",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"0.9476239854368701"
|
||
]
|
||
},
|
||
"execution_count": 39,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"roc_auc_score(y_test, y_pred_proba)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "475ac08f",
|
||
"metadata": {},
|
||
"source": [
|
||
"# *Дополнительные задания:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "3728fe05",
|
||
"metadata": {},
|
||
"source": [
|
||
"Загрузите датасет Wine из встроенных датасетов sklearn.datasets с помощью функции load_wine в переменную data."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 40,
|
||
"id": "e0600074",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.datasets import load_wine\n",
|
||
"data = load_wine()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "6e22dfd0",
|
||
"metadata": {},
|
||
"source": [
|
||
"Полученный датасет не является датафреймом. Это структура данных, имеющая ключи аналогично словарю. Просмотрите тип данных этой структуры данных и создайте список data_keys, содержащий ее ключи."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 41,
|
||
"id": "1cc31a29",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'sklearn.utils.Bunch'> \n",
|
||
"\n",
|
||
"dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print(type(data), '\\n')\n",
|
||
"data_keys = list(data.keys())  # the task asks for a list; .keys() alone returns a dict_keys view\n",
|
||
"print(data_keys)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "05b0491c",
|
||
"metadata": {},
|
||
"source": [
|
||
"Просмотрите данные, описание и названия признаков в датасете. Описание нужно вывести в виде привычного, аккуратно оформленного текста, без обозначений переноса строки, но с самими переносами и т.д."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 42,
|
||
"id": "67997daa",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,\n",
|
||
" 1.065e+03],\n",
|
||
" [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,\n",
|
||
" 1.050e+03],\n",
|
||
" [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,\n",
|
||
" 1.185e+03],\n",
|
||
" ...,\n",
|
||
" [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,\n",
|
||
" 8.350e+02],\n",
|
||
" [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,\n",
|
||
" 8.400e+02],\n",
|
||
" [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,\n",
|
||
" 5.600e+02]])"
|
||
]
|
||
},
|
||
"execution_count": 42,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"data.data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 43,
|
||
"id": "42bf0f1f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
".. _wine_dataset:\n",
|
||
"\n",
|
||
"Wine recognition dataset\n",
|
||
"------------------------\n",
|
||
"\n",
|
||
"**Data Set Characteristics:**\n",
|
||
"\n",
|
||
" :Number of Instances: 178 (50 in each of three classes)\n",
|
||
" :Number of Attributes: 13 numeric, predictive attributes and the class\n",
|
||
" :Attribute Information:\n",
|
||
" \t\t- Alcohol\n",
|
||
" \t\t- Malic acid\n",
|
||
" \t\t- Ash\n",
|
||
"\t\t- Alcalinity of ash \n",
|
||
" \t\t- Magnesium\n",
|
||
"\t\t- Total phenols\n",
|
||
" \t\t- Flavanoids\n",
|
||
" \t\t- Nonflavanoid phenols\n",
|
||
" \t\t- Proanthocyanins\n",
|
||
"\t\t- Color intensity\n",
|
||
" \t\t- Hue\n",
|
||
" \t\t- OD280/OD315 of diluted wines\n",
|
||
" \t\t- Proline\n",
|
||
"\n",
|
||
" - class:\n",
|
||
" - class_0\n",
|
||
" - class_1\n",
|
||
" - class_2\n",
|
||
"\t\t\n",
|
||
" :Summary Statistics:\n",
|
||
" \n",
|
||
" ============================= ==== ===== ======= =====\n",
|
||
" Min Max Mean SD\n",
|
||
" ============================= ==== ===== ======= =====\n",
|
||
" Alcohol: 11.0 14.8 13.0 0.8\n",
|
||
" Malic Acid: 0.74 5.80 2.34 1.12\n",
|
||
" Ash: 1.36 3.23 2.36 0.27\n",
|
||
" Alcalinity of Ash: 10.6 30.0 19.5 3.3\n",
|
||
" Magnesium: 70.0 162.0 99.7 14.3\n",
|
||
" Total Phenols: 0.98 3.88 2.29 0.63\n",
|
||
" Flavanoids: 0.34 5.08 2.03 1.00\n",
|
||
" Nonflavanoid Phenols: 0.13 0.66 0.36 0.12\n",
|
||
" Proanthocyanins: 0.41 3.58 1.59 0.57\n",
|
||
" Colour Intensity: 1.3 13.0 5.1 2.3\n",
|
||
" Hue: 0.48 1.71 0.96 0.23\n",
|
||
" OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.71\n",
|
||
" Proline: 278 1680 746 315\n",
|
||
" ============================= ==== ===== ======= =====\n",
|
||
"\n",
|
||
" :Missing Attribute Values: None\n",
|
||
" :Class Distribution: class_0 (59), class_1 (71), class_2 (48)\n",
|
||
" :Creator: R.A. Fisher\n",
|
||
" :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n",
|
||
" :Date: July, 1988\n",
|
||
"\n",
|
||
"This is a copy of UCI ML Wine recognition datasets.\n",
|
||
"https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data\n",
|
||
"\n",
|
||
"The data is the results of a chemical analysis of wines grown in the same\n",
|
||
"region in Italy by three different cultivators. There are thirteen different\n",
|
||
"measurements taken for different constituents found in the three types of\n",
|
||
"wine.\n",
|
||
"\n",
|
||
"Original Owners: \n",
|
||
"\n",
|
||
"Forina, M. et al, PARVUS - \n",
|
||
"An Extendible Package for Data Exploration, Classification and Correlation. \n",
|
||
"Institute of Pharmaceutical and Food Analysis and Technologies,\n",
|
||
"Via Brigata Salerno, 16147 Genoa, Italy.\n",
|
||
"\n",
|
||
"Citation:\n",
|
||
"\n",
|
||
"Lichman, M. (2013). UCI Machine Learning Repository\n",
|
||
"[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,\n",
|
||
"School of Information and Computer Science. \n",
|
||
"\n",
|
||
".. topic:: References\n",
|
||
"\n",
|
||
" (1) S. Aeberhard, D. Coomans and O. de Vel, \n",
|
||
" Comparison of Classifiers in High Dimensional Settings, \n",
|
||
" Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of \n",
|
||
" Mathematics and Statistics, James Cook University of North Queensland. \n",
|
||
" (Also submitted to Technometrics). \n",
|
||
"\n",
|
||
" The data was used with many others for comparing various \n",
|
||
" classifiers. The classes are separable, though only RDA \n",
|
||
" has achieved 100% correct classification. \n",
|
||
" (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) \n",
|
||
" (All results using the leave-one-out technique) \n",
|
||
"\n",
|
||
" (2) S. Aeberhard, D. Coomans and O. de Vel, \n",
|
||
" \"THE CLASSIFICATION PERFORMANCE OF RDA\" \n",
|
||
" Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of \n",
|
||
" Mathematics and Statistics, James Cook University of North Queensland. \n",
|
||
" (Also submitted to Journal of Chemometrics).\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print(data.DESCR)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 44,
|
||
"id": "3990394e",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"['alcohol',\n",
|
||
" 'malic_acid',\n",
|
||
" 'ash',\n",
|
||
" 'alcalinity_of_ash',\n",
|
||
" 'magnesium',\n",
|
||
" 'total_phenols',\n",
|
||
" 'flavanoids',\n",
|
||
" 'nonflavanoid_phenols',\n",
|
||
" 'proanthocyanins',\n",
|
||
" 'color_intensity',\n",
|
||
" 'hue',\n",
|
||
" 'od280/od315_of_diluted_wines',\n",
|
||
" 'proline']"
|
||
]
|
||
},
|
||
"execution_count": 44,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"data.feature_names"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "ca3e3b90",
|
||
"metadata": {},
|
||
"source": [
|
||
"Сколько классов содержит целевая переменная датасета? Выведите названия классов."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 45,
|
||
"id": "3dcc2473",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Количество классов: (3,) \n",
|
||
"\n",
|
||
"Названия классов: ['class_0' 'class_1' 'class_2']\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print('Количество классов: ', np.unique(data[\"target\"]).shape, '\\n')\n",
|
||
"print('Названия классов: ',data[\"target_names\"])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "4eb0d981",
|
||
"metadata": {},
|
||
"source": [
|
||
"На основе данных датасета (они содержатся в двумерном массиве NumPy) и названий признаков создайте датафрейм под названием X."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 46,
|
||
"id": "52257354",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>alcohol</th>\n",
|
||
" <th>malic_acid</th>\n",
|
||
" <th>ash</th>\n",
|
||
" <th>alcalinity_of_ash</th>\n",
|
||
" <th>magnesium</th>\n",
|
||
" <th>total_phenols</th>\n",
|
||
" <th>flavanoids</th>\n",
|
||
" <th>nonflavanoid_phenols</th>\n",
|
||
" <th>proanthocyanins</th>\n",
|
||
" <th>color_intensity</th>\n",
|
||
" <th>hue</th>\n",
|
||
" <th>od280/od315_of_diluted_wines</th>\n",
|
||
" <th>proline</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>14.23</td>\n",
|
||
" <td>1.71</td>\n",
|
||
" <td>2.43</td>\n",
|
||
" <td>15.6</td>\n",
|
||
" <td>127.0</td>\n",
|
||
" <td>2.80</td>\n",
|
||
" <td>3.06</td>\n",
|
||
" <td>0.28</td>\n",
|
||
" <td>2.29</td>\n",
|
||
" <td>5.64</td>\n",
|
||
" <td>1.04</td>\n",
|
||
" <td>3.92</td>\n",
|
||
" <td>1065.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>13.20</td>\n",
|
||
" <td>1.78</td>\n",
|
||
" <td>2.14</td>\n",
|
||
" <td>11.2</td>\n",
|
||
" <td>100.0</td>\n",
|
||
" <td>2.65</td>\n",
|
||
" <td>2.76</td>\n",
|
||
" <td>0.26</td>\n",
|
||
" <td>1.28</td>\n",
|
||
" <td>4.38</td>\n",
|
||
" <td>1.05</td>\n",
|
||
" <td>3.40</td>\n",
|
||
" <td>1050.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>13.16</td>\n",
|
||
" <td>2.36</td>\n",
|
||
" <td>2.67</td>\n",
|
||
" <td>18.6</td>\n",
|
||
" <td>101.0</td>\n",
|
||
" <td>2.80</td>\n",
|
||
" <td>3.24</td>\n",
|
||
" <td>0.30</td>\n",
|
||
" <td>2.81</td>\n",
|
||
" <td>5.68</td>\n",
|
||
" <td>1.03</td>\n",
|
||
" <td>3.17</td>\n",
|
||
" <td>1185.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>14.37</td>\n",
|
||
" <td>1.95</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>16.8</td>\n",
|
||
" <td>113.0</td>\n",
|
||
" <td>3.85</td>\n",
|
||
" <td>3.49</td>\n",
|
||
" <td>0.24</td>\n",
|
||
" <td>2.18</td>\n",
|
||
" <td>7.80</td>\n",
|
||
" <td>0.86</td>\n",
|
||
" <td>3.45</td>\n",
|
||
" <td>1480.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>13.24</td>\n",
|
||
" <td>2.59</td>\n",
|
||
" <td>2.87</td>\n",
|
||
" <td>21.0</td>\n",
|
||
" <td>118.0</td>\n",
|
||
" <td>2.80</td>\n",
|
||
" <td>2.69</td>\n",
|
||
" <td>0.39</td>\n",
|
||
" <td>1.82</td>\n",
|
||
" <td>4.32</td>\n",
|
||
" <td>1.04</td>\n",
|
||
" <td>2.93</td>\n",
|
||
" <td>735.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
|
||
"0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
|
||
"1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
|
||
"2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
|
||
"3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
|
||
"4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
|
||
"\n",
|
||
" flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
|
||
"0 3.06 0.28 2.29 5.64 1.04 \n",
|
||
"1 2.76 0.26 1.28 4.38 1.05 \n",
|
||
"2 3.24 0.30 2.81 5.68 1.03 \n",
|
||
"3 3.49 0.24 2.18 7.80 0.86 \n",
|
||
"4 2.69 0.39 1.82 4.32 1.04 \n",
|
||
"\n",
|
||
" od280/od315_of_diluted_wines proline \n",
|
||
"0 3.92 1065.0 \n",
|
||
"1 3.40 1050.0 \n",
|
||
"2 3.17 1185.0 \n",
|
||
"3 3.45 1480.0 \n",
|
||
"4 2.93 735.0 "
|
||
]
|
||
},
|
||
"execution_count": 46,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"X = pd.DataFrame(data.data, columns=data.feature_names)\n",
|
||
"X.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "917c33ed",
|
||
"metadata": {},
|
||
"source": [
|
||
"Выясните размер датафрейма X и установите, имеются ли в нем пропущенные значения."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 47,
|
||
"id": "f66d1569",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(178, 13)"
|
||
]
|
||
},
|
||
"execution_count": 47,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"X.shape"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 48,
|
||
"id": "4a1379f8",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 178 entries, 0 to 177\n",
|
||
"Data columns (total 13 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 alcohol 178 non-null float64\n",
|
||
" 1 malic_acid 178 non-null float64\n",
|
||
" 2 ash 178 non-null float64\n",
|
||
" 3 alcalinity_of_ash 178 non-null float64\n",
|
||
" 4 magnesium 178 non-null float64\n",
|
||
" 5 total_phenols 178 non-null float64\n",
|
||
" 6 flavanoids 178 non-null float64\n",
|
||
" 7 nonflavanoid_phenols 178 non-null float64\n",
|
||
" 8 proanthocyanins 178 non-null float64\n",
|
||
" 9 color_intensity 178 non-null float64\n",
|
||
" 10 hue 178 non-null float64\n",
|
||
" 11 od280/od315_of_diluted_wines 178 non-null float64\n",
|
||
" 12 proline 178 non-null float64\n",
|
||
"dtypes: float64(13)\n",
|
||
"memory usage: 18.2 KB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"X.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 49,
|
||
"id": "f5573521",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"alcohol 0\n",
|
||
"malic_acid 0\n",
|
||
"ash 0\n",
|
||
"alcalinity_of_ash 0\n",
|
||
"magnesium 0\n",
|
||
"total_phenols 0\n",
|
||
"flavanoids 0\n",
|
||
"nonflavanoid_phenols 0\n",
|
||
"proanthocyanins 0\n",
|
||
"color_intensity 0\n",
|
||
"hue 0\n",
|
||
"od280/od315_of_diluted_wines 0\n",
|
||
"proline 0\n",
|
||
"dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 49,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"X.isnull().astype(\"int\").sum()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "7fcfb081",
|
||
"metadata": {},
|
||
"source": [
|
||
"Добавьте в датафрейм поле с классами вин в виде чисел, имеющих тип данных numpy.int64. Название поля - 'target'."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 50,
|
||
"id": "89d0aa13",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 178 entries, 0 to 177\n",
|
||
"Data columns (total 14 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 alcohol 178 non-null float64\n",
|
||
" 1 malic_acid 178 non-null float64\n",
|
||
" 2 ash 178 non-null float64\n",
|
||
" 3 alcalinity_of_ash 178 non-null float64\n",
|
||
" 4 magnesium 178 non-null float64\n",
|
||
" 5 total_phenols 178 non-null float64\n",
|
||
" 6 flavanoids 178 non-null float64\n",
|
||
" 7 nonflavanoid_phenols 178 non-null float64\n",
|
||
" 8 proanthocyanins 178 non-null float64\n",
|
||
" 9 color_intensity 178 non-null float64\n",
|
||
" 10 hue 178 non-null float64\n",
|
||
" 11 od280/od315_of_diluted_wines 178 non-null float64\n",
|
||
" 12 proline 178 non-null float64\n",
|
||
" 13 target 178 non-null int64 \n",
|
||
"dtypes: float64(13), int64(1)\n",
|
||
"memory usage: 19.6 KB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"X[\"target\"]=data[\"target\"].astype(np.int64)\n",
|
||
"X.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 51,
|
||
"id": "50bcdef6",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>alcohol</th>\n",
|
||
" <th>malic_acid</th>\n",
|
||
" <th>ash</th>\n",
|
||
" <th>alcalinity_of_ash</th>\n",
|
||
" <th>magnesium</th>\n",
|
||
" <th>total_phenols</th>\n",
|
||
" <th>flavanoids</th>\n",
|
||
" <th>nonflavanoid_phenols</th>\n",
|
||
" <th>proanthocyanins</th>\n",
|
||
" <th>color_intensity</th>\n",
|
||
" <th>hue</th>\n",
|
||
" <th>od280/od315_of_diluted_wines</th>\n",
|
||
" <th>proline</th>\n",
|
||
" <th>target</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>14.23</td>\n",
|
||
" <td>1.71</td>\n",
|
||
" <td>2.43</td>\n",
|
||
" <td>15.6</td>\n",
|
||
" <td>127.0</td>\n",
|
||
" <td>2.80</td>\n",
|
||
" <td>3.06</td>\n",
|
||
" <td>0.28</td>\n",
|
||
" <td>2.29</td>\n",
|
||
" <td>5.64</td>\n",
|
||
" <td>1.04</td>\n",
|
||
" <td>3.92</td>\n",
|
||
" <td>1065.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>13.20</td>\n",
|
||
" <td>1.78</td>\n",
|
||
" <td>2.14</td>\n",
|
||
" <td>11.2</td>\n",
|
||
" <td>100.0</td>\n",
|
||
" <td>2.65</td>\n",
|
||
" <td>2.76</td>\n",
|
||
" <td>0.26</td>\n",
|
||
" <td>1.28</td>\n",
|
||
" <td>4.38</td>\n",
|
||
" <td>1.05</td>\n",
|
||
" <td>3.40</td>\n",
|
||
" <td>1050.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>13.16</td>\n",
|
||
" <td>2.36</td>\n",
|
||
" <td>2.67</td>\n",
|
||
" <td>18.6</td>\n",
|
||
" <td>101.0</td>\n",
|
||
" <td>2.80</td>\n",
|
||
" <td>3.24</td>\n",
|
||
" <td>0.30</td>\n",
|
||
" <td>2.81</td>\n",
|
||
" <td>5.68</td>\n",
|
||
" <td>1.03</td>\n",
|
||
" <td>3.17</td>\n",
|
||
" <td>1185.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>14.37</td>\n",
|
||
" <td>1.95</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>16.8</td>\n",
|
||
" <td>113.0</td>\n",
|
||
" <td>3.85</td>\n",
|
||
" <td>3.49</td>\n",
|
||
" <td>0.24</td>\n",
|
||
" <td>2.18</td>\n",
|
||
" <td>7.80</td>\n",
|
||
" <td>0.86</td>\n",
|
||
" <td>3.45</td>\n",
|
||
" <td>1480.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>13.24</td>\n",
|
||
" <td>2.59</td>\n",
|
||
" <td>2.87</td>\n",
|
||
" <td>21.0</td>\n",
|
||
" <td>118.0</td>\n",
|
||
" <td>2.80</td>\n",
|
||
" <td>2.69</td>\n",
|
||
" <td>0.39</td>\n",
|
||
" <td>1.82</td>\n",
|
||
" <td>4.32</td>\n",
|
||
" <td>1.04</td>\n",
|
||
" <td>2.93</td>\n",
|
||
" <td>735.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
|
||
"0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
|
||
"1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
|
||
"2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
|
||
"3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
|
||
"4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
|
||
"\n",
|
||
" flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
|
||
"0 3.06 0.28 2.29 5.64 1.04 \n",
|
||
"1 2.76 0.26 1.28 4.38 1.05 \n",
|
||
"2 3.24 0.30 2.81 5.68 1.03 \n",
|
||
"3 3.49 0.24 2.18 7.80 0.86 \n",
|
||
"4 2.69 0.39 1.82 4.32 1.04 \n",
|
||
"\n",
|
||
" od280/od315_of_diluted_wines proline target \n",
|
||
"0 3.92 1065.0 0 \n",
|
||
"1 3.40 1050.0 0 \n",
|
||
"2 3.17 1185.0 0 \n",
|
||
"3 3.45 1480.0 0 \n",
|
||
"4 2.93 735.0 0 "
|
||
]
|
||
},
|
||
"execution_count": 51,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"X.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "704ea79c",
|
||
"metadata": {},
|
||
"source": [
|
||
"Постройте матрицу корреляций для всех полей X. Дайте полученному датафрейму название X_corr."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 52,
|
||
"id": "41d5c34c",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>alcohol</th>\n",
|
||
" <th>malic_acid</th>\n",
|
||
" <th>ash</th>\n",
|
||
" <th>alcalinity_of_ash</th>\n",
|
||
" <th>magnesium</th>\n",
|
||
" <th>total_phenols</th>\n",
|
||
" <th>flavanoids</th>\n",
|
||
" <th>nonflavanoid_phenols</th>\n",
|
||
" <th>proanthocyanins</th>\n",
|
||
" <th>color_intensity</th>\n",
|
||
" <th>hue</th>\n",
|
||
" <th>od280/od315_of_diluted_wines</th>\n",
|
||
" <th>proline</th>\n",
|
||
" <th>target</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>alcohol</th>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.094397</td>\n",
|
||
" <td>0.211545</td>\n",
|
||
" <td>-0.310235</td>\n",
|
||
" <td>0.270798</td>\n",
|
||
" <td>0.289101</td>\n",
|
||
" <td>0.236815</td>\n",
|
||
" <td>-0.155929</td>\n",
|
||
" <td>0.136698</td>\n",
|
||
" <td>0.546364</td>\n",
|
||
" <td>-0.071747</td>\n",
|
||
" <td>0.072343</td>\n",
|
||
" <td>0.643720</td>\n",
|
||
" <td>-0.328222</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>malic_acid</th>\n",
|
||
" <td>0.094397</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.164045</td>\n",
|
||
" <td>0.288500</td>\n",
|
||
" <td>-0.054575</td>\n",
|
||
" <td>-0.335167</td>\n",
|
||
" <td>-0.411007</td>\n",
|
||
" <td>0.292977</td>\n",
|
||
" <td>-0.220746</td>\n",
|
||
" <td>0.248985</td>\n",
|
||
" <td>-0.561296</td>\n",
|
||
" <td>-0.368710</td>\n",
|
||
" <td>-0.192011</td>\n",
|
||
" <td>0.437776</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>ash</th>\n",
|
||
" <td>0.211545</td>\n",
|
||
" <td>0.164045</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.443367</td>\n",
|
||
" <td>0.286587</td>\n",
|
||
" <td>0.128980</td>\n",
|
||
" <td>0.115077</td>\n",
|
||
" <td>0.186230</td>\n",
|
||
" <td>0.009652</td>\n",
|
||
" <td>0.258887</td>\n",
|
||
" <td>-0.074667</td>\n",
|
||
" <td>0.003911</td>\n",
|
||
" <td>0.223626</td>\n",
|
||
" <td>-0.049643</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>alcalinity_of_ash</th>\n",
|
||
" <td>-0.310235</td>\n",
|
||
" <td>0.288500</td>\n",
|
||
" <td>0.443367</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>-0.083333</td>\n",
|
||
" <td>-0.321113</td>\n",
|
||
" <td>-0.351370</td>\n",
|
||
" <td>0.361922</td>\n",
|
||
" <td>-0.197327</td>\n",
|
||
" <td>0.018732</td>\n",
|
||
" <td>-0.273955</td>\n",
|
||
" <td>-0.276769</td>\n",
|
||
" <td>-0.440597</td>\n",
|
||
" <td>0.517859</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>magnesium</th>\n",
|
||
" <td>0.270798</td>\n",
|
||
" <td>-0.054575</td>\n",
|
||
" <td>0.286587</td>\n",
|
||
" <td>-0.083333</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.214401</td>\n",
|
||
" <td>0.195784</td>\n",
|
||
" <td>-0.256294</td>\n",
|
||
" <td>0.236441</td>\n",
|
||
" <td>0.199950</td>\n",
|
||
" <td>0.055398</td>\n",
|
||
" <td>0.066004</td>\n",
|
||
" <td>0.393351</td>\n",
|
||
" <td>-0.209179</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>total_phenols</th>\n",
|
||
" <td>0.289101</td>\n",
|
||
" <td>-0.335167</td>\n",
|
||
" <td>0.128980</td>\n",
|
||
" <td>-0.321113</td>\n",
|
||
" <td>0.214401</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.864564</td>\n",
|
||
" <td>-0.449935</td>\n",
|
||
" <td>0.612413</td>\n",
|
||
" <td>-0.055136</td>\n",
|
||
" <td>0.433681</td>\n",
|
||
" <td>0.699949</td>\n",
|
||
" <td>0.498115</td>\n",
|
||
" <td>-0.719163</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>flavanoids</th>\n",
|
||
" <td>0.236815</td>\n",
|
||
" <td>-0.411007</td>\n",
|
||
" <td>0.115077</td>\n",
|
||
" <td>-0.351370</td>\n",
|
||
" <td>0.195784</td>\n",
|
||
" <td>0.864564</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>-0.537900</td>\n",
|
||
" <td>0.652692</td>\n",
|
||
" <td>-0.172379</td>\n",
|
||
" <td>0.543479</td>\n",
|
||
" <td>0.787194</td>\n",
|
||
" <td>0.494193</td>\n",
|
||
" <td>-0.847498</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>nonflavanoid_phenols</th>\n",
|
||
" <td>-0.155929</td>\n",
|
||
" <td>0.292977</td>\n",
|
||
" <td>0.186230</td>\n",
|
||
" <td>0.361922</td>\n",
|
||
" <td>-0.256294</td>\n",
|
||
" <td>-0.449935</td>\n",
|
||
" <td>-0.537900</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>-0.365845</td>\n",
|
||
" <td>0.139057</td>\n",
|
||
" <td>-0.262640</td>\n",
|
||
" <td>-0.503270</td>\n",
|
||
" <td>-0.311385</td>\n",
|
||
" <td>0.489109</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>proanthocyanins</th>\n",
|
||
" <td>0.136698</td>\n",
|
||
" <td>-0.220746</td>\n",
|
||
" <td>0.009652</td>\n",
|
||
" <td>-0.197327</td>\n",
|
||
" <td>0.236441</td>\n",
|
||
" <td>0.612413</td>\n",
|
||
" <td>0.652692</td>\n",
|
||
" <td>-0.365845</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>-0.025250</td>\n",
|
||
" <td>0.295544</td>\n",
|
||
" <td>0.519067</td>\n",
|
||
" <td>0.330417</td>\n",
|
||
" <td>-0.499130</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>color_intensity</th>\n",
|
||
" <td>0.546364</td>\n",
|
||
" <td>0.248985</td>\n",
|
||
" <td>0.258887</td>\n",
|
||
" <td>0.018732</td>\n",
|
||
" <td>0.199950</td>\n",
|
||
" <td>-0.055136</td>\n",
|
||
" <td>-0.172379</td>\n",
|
||
" <td>0.139057</td>\n",
|
||
" <td>-0.025250</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>-0.521813</td>\n",
|
||
" <td>-0.428815</td>\n",
|
||
" <td>0.316100</td>\n",
|
||
" <td>0.265668</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>hue</th>\n",
|
||
" <td>-0.071747</td>\n",
|
||
" <td>-0.561296</td>\n",
|
||
" <td>-0.074667</td>\n",
|
||
" <td>-0.273955</td>\n",
|
||
" <td>0.055398</td>\n",
|
||
" <td>0.433681</td>\n",
|
||
" <td>0.543479</td>\n",
|
||
" <td>-0.262640</td>\n",
|
||
" <td>0.295544</td>\n",
|
||
" <td>-0.521813</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.565468</td>\n",
|
||
" <td>0.236183</td>\n",
|
||
" <td>-0.617369</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>od280/od315_of_diluted_wines</th>\n",
|
||
" <td>0.072343</td>\n",
|
||
" <td>-0.368710</td>\n",
|
||
" <td>0.003911</td>\n",
|
||
" <td>-0.276769</td>\n",
|
||
" <td>0.066004</td>\n",
|
||
" <td>0.699949</td>\n",
|
||
" <td>0.787194</td>\n",
|
||
" <td>-0.503270</td>\n",
|
||
" <td>0.519067</td>\n",
|
||
" <td>-0.428815</td>\n",
|
||
" <td>0.565468</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.312761</td>\n",
|
||
" <td>-0.788230</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>proline</th>\n",
|
||
" <td>0.643720</td>\n",
|
||
" <td>-0.192011</td>\n",
|
||
" <td>0.223626</td>\n",
|
||
" <td>-0.440597</td>\n",
|
||
" <td>0.393351</td>\n",
|
||
" <td>0.498115</td>\n",
|
||
" <td>0.494193</td>\n",
|
||
" <td>-0.311385</td>\n",
|
||
" <td>0.330417</td>\n",
|
||
" <td>0.316100</td>\n",
|
||
" <td>0.236183</td>\n",
|
||
" <td>0.312761</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>-0.633717</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>target</th>\n",
|
||
" <td>-0.328222</td>\n",
|
||
" <td>0.437776</td>\n",
|
||
" <td>-0.049643</td>\n",
|
||
" <td>0.517859</td>\n",
|
||
" <td>-0.209179</td>\n",
|
||
" <td>-0.719163</td>\n",
|
||
" <td>-0.847498</td>\n",
|
||
" <td>0.489109</td>\n",
|
||
" <td>-0.499130</td>\n",
|
||
" <td>0.265668</td>\n",
|
||
" <td>-0.617369</td>\n",
|
||
" <td>-0.788230</td>\n",
|
||
" <td>-0.633717</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" alcohol malic_acid ash \\\n",
|
||
"alcohol 1.000000 0.094397 0.211545 \n",
|
||
"malic_acid 0.094397 1.000000 0.164045 \n",
|
||
"ash 0.211545 0.164045 1.000000 \n",
|
||
"alcalinity_of_ash -0.310235 0.288500 0.443367 \n",
|
||
"magnesium 0.270798 -0.054575 0.286587 \n",
|
||
"total_phenols 0.289101 -0.335167 0.128980 \n",
|
||
"flavanoids 0.236815 -0.411007 0.115077 \n",
|
||
"nonflavanoid_phenols -0.155929 0.292977 0.186230 \n",
|
||
"proanthocyanins 0.136698 -0.220746 0.009652 \n",
|
||
"color_intensity 0.546364 0.248985 0.258887 \n",
|
||
"hue -0.071747 -0.561296 -0.074667 \n",
|
||
"od280/od315_of_diluted_wines 0.072343 -0.368710 0.003911 \n",
|
||
"proline 0.643720 -0.192011 0.223626 \n",
|
||
"target -0.328222 0.437776 -0.049643 \n",
|
||
"\n",
|
||
" alcalinity_of_ash magnesium total_phenols \\\n",
|
||
"alcohol -0.310235 0.270798 0.289101 \n",
|
||
"malic_acid 0.288500 -0.054575 -0.335167 \n",
|
||
"ash 0.443367 0.286587 0.128980 \n",
|
||
"alcalinity_of_ash 1.000000 -0.083333 -0.321113 \n",
|
||
"magnesium -0.083333 1.000000 0.214401 \n",
|
||
"total_phenols -0.321113 0.214401 1.000000 \n",
|
||
"flavanoids -0.351370 0.195784 0.864564 \n",
|
||
"nonflavanoid_phenols 0.361922 -0.256294 -0.449935 \n",
|
||
"proanthocyanins -0.197327 0.236441 0.612413 \n",
|
||
"color_intensity 0.018732 0.199950 -0.055136 \n",
|
||
"hue -0.273955 0.055398 0.433681 \n",
|
||
"od280/od315_of_diluted_wines -0.276769 0.066004 0.699949 \n",
|
||
"proline -0.440597 0.393351 0.498115 \n",
|
||
"target 0.517859 -0.209179 -0.719163 \n",
|
||
"\n",
|
||
" flavanoids nonflavanoid_phenols \\\n",
|
||
"alcohol 0.236815 -0.155929 \n",
|
||
"malic_acid -0.411007 0.292977 \n",
|
||
"ash 0.115077 0.186230 \n",
|
||
"alcalinity_of_ash -0.351370 0.361922 \n",
|
||
"magnesium 0.195784 -0.256294 \n",
|
||
"total_phenols 0.864564 -0.449935 \n",
|
||
"flavanoids 1.000000 -0.537900 \n",
|
||
"nonflavanoid_phenols -0.537900 1.000000 \n",
|
||
"proanthocyanins 0.652692 -0.365845 \n",
|
||
"color_intensity -0.172379 0.139057 \n",
|
||
"hue 0.543479 -0.262640 \n",
|
||
"od280/od315_of_diluted_wines 0.787194 -0.503270 \n",
|
||
"proline 0.494193 -0.311385 \n",
|
||
"target -0.847498 0.489109 \n",
|
||
"\n",
|
||
" proanthocyanins color_intensity hue \\\n",
|
||
"alcohol 0.136698 0.546364 -0.071747 \n",
|
||
"malic_acid -0.220746 0.248985 -0.561296 \n",
|
||
"ash 0.009652 0.258887 -0.074667 \n",
|
||
"alcalinity_of_ash -0.197327 0.018732 -0.273955 \n",
|
||
"magnesium 0.236441 0.199950 0.055398 \n",
|
||
"total_phenols 0.612413 -0.055136 0.433681 \n",
|
||
"flavanoids 0.652692 -0.172379 0.543479 \n",
|
||
"nonflavanoid_phenols -0.365845 0.139057 -0.262640 \n",
|
||
"proanthocyanins 1.000000 -0.025250 0.295544 \n",
|
||
"color_intensity -0.025250 1.000000 -0.521813 \n",
|
||
"hue 0.295544 -0.521813 1.000000 \n",
|
||
"od280/od315_of_diluted_wines 0.519067 -0.428815 0.565468 \n",
|
||
"proline 0.330417 0.316100 0.236183 \n",
|
||
"target -0.499130 0.265668 -0.617369 \n",
|
||
"\n",
|
||
" od280/od315_of_diluted_wines proline target \n",
|
||
"alcohol 0.072343 0.643720 -0.328222 \n",
|
||
"malic_acid -0.368710 -0.192011 0.437776 \n",
|
||
"ash 0.003911 0.223626 -0.049643 \n",
|
||
"alcalinity_of_ash -0.276769 -0.440597 0.517859 \n",
|
||
"magnesium 0.066004 0.393351 -0.209179 \n",
|
||
"total_phenols 0.699949 0.498115 -0.719163 \n",
|
||
"flavanoids 0.787194 0.494193 -0.847498 \n",
|
||
"nonflavanoid_phenols -0.503270 -0.311385 0.489109 \n",
|
||
"proanthocyanins 0.519067 0.330417 -0.499130 \n",
|
||
"color_intensity -0.428815 0.316100 0.265668 \n",
|
||
"hue 0.565468 0.236183 -0.617369 \n",
|
||
"od280/od315_of_diluted_wines 1.000000 0.312761 -0.788230 \n",
|
||
"proline 0.312761 1.000000 -0.633717 \n",
|
||
"target -0.788230 -0.633717 1.000000 "
|
||
]
|
||
},
|
||
"execution_count": 52,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"X_corr = X.corr()\n",
|
||
"X_corr"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "1ed7c122",
|
||
"metadata": {},
|
||
"source": [
|
||
"Создайте список high_corr из признаков, корреляция которых с полем target по абсолютному значению превышает 0.5 (причем само поле target не должно входить в этот список)."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 53,
|
||
"id": "6edf6763",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Index(['alcalinity_of_ash', 'total_phenols', 'flavanoids', 'hue',\n",
|
||
" 'od280/od315_of_diluted_wines', 'proline'],\n",
|
||
" dtype='object')"
|
||
]
|
||
},
|
||
"execution_count": 53,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"high_corr = X_corr.loc[(X_corr.index != 'target') & (abs(X_corr['target']) > .5), X_corr.columns != 'target'].index\n",
|
||
"high_corr"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "f0ff52e6",
|
||
"metadata": {},
|
||
"source": [
|
||
"Удалите из датафрейма X поле с целевой переменной. Для всех признаков, названия которых содержатся в списке high_corr, вычислите квадрат их значений и добавьте в датафрейм X соответствующие поля с суффиксом '_2', добавленным к первоначальному названию признака. Итоговый датафрейм должен содержать все поля, которые были в нем изначально, а также поля с признаками из списка high_corr, возведенными в квадрат. Выведите описание полей датафрейма X с помощью метода describe."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 54,
|
||
"id": "1e1403ec",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>alcohol</th>\n",
|
||
" <th>malic_acid</th>\n",
|
||
" <th>ash</th>\n",
|
||
" <th>alcalinity_of_ash</th>\n",
|
||
" <th>magnesium</th>\n",
|
||
" <th>total_phenols</th>\n",
|
||
" <th>flavanoids</th>\n",
|
||
" <th>nonflavanoid_phenols</th>\n",
|
||
" <th>proanthocyanins</th>\n",
|
||
" <th>color_intensity</th>\n",
|
||
" <th>hue</th>\n",
|
||
" <th>od280/od315_of_diluted_wines</th>\n",
|
||
" <th>proline</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>14.23</td>\n",
|
||
" <td>1.71</td>\n",
|
||
" <td>2.43</td>\n",
|
||
" <td>15.6</td>\n",
|
||
" <td>127.0</td>\n",
|
||
" <td>2.80</td>\n",
|
||
" <td>3.06</td>\n",
|
||
" <td>0.28</td>\n",
|
||
" <td>2.29</td>\n",
|
||
" <td>5.64</td>\n",
|
||
" <td>1.04</td>\n",
|
||
" <td>3.92</td>\n",
|
||
" <td>1065.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>13.20</td>\n",
|
||
" <td>1.78</td>\n",
|
||
" <td>2.14</td>\n",
|
||
" <td>11.2</td>\n",
|
||
" <td>100.0</td>\n",
|
||
" <td>2.65</td>\n",
|
||
" <td>2.76</td>\n",
|
||
" <td>0.26</td>\n",
|
||
" <td>1.28</td>\n",
|
||
" <td>4.38</td>\n",
|
||
" <td>1.05</td>\n",
|
||
" <td>3.40</td>\n",
|
||
" <td>1050.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>13.16</td>\n",
|
||
" <td>2.36</td>\n",
|
||
" <td>2.67</td>\n",
|
||
" <td>18.6</td>\n",
|
||
" <td>101.0</td>\n",
|
||
" <td>2.80</td>\n",
|
||
" <td>3.24</td>\n",
|
||
" <td>0.30</td>\n",
|
||
" <td>2.81</td>\n",
|
||
" <td>5.68</td>\n",
|
||
" <td>1.03</td>\n",
|
||
" <td>3.17</td>\n",
|
||
" <td>1185.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>14.37</td>\n",
|
||
" <td>1.95</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>16.8</td>\n",
|
||
" <td>113.0</td>\n",
|
||
" <td>3.85</td>\n",
|
||
" <td>3.49</td>\n",
|
||
" <td>0.24</td>\n",
|
||
" <td>2.18</td>\n",
|
||
" <td>7.80</td>\n",
|
||
" <td>0.86</td>\n",
|
||
" <td>3.45</td>\n",
|
||
" <td>1480.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>13.24</td>\n",
|
||
" <td>2.59</td>\n",
|
||
" <td>2.87</td>\n",
|
||
" <td>21.0</td>\n",
|
||
" <td>118.0</td>\n",
|
||
" <td>2.80</td>\n",
|
||
" <td>2.69</td>\n",
|
||
" <td>0.39</td>\n",
|
||
" <td>1.82</td>\n",
|
||
" <td>4.32</td>\n",
|
||
" <td>1.04</td>\n",
|
||
" <td>2.93</td>\n",
|
||
" <td>735.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
|
||
"0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
|
||
"1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
|
||
"2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
|
||
"3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
|
||
"4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
|
||
"\n",
|
||
" flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
|
||
"0 3.06 0.28 2.29 5.64 1.04 \n",
|
||
"1 2.76 0.26 1.28 4.38 1.05 \n",
|
||
"2 3.24 0.30 2.81 5.68 1.03 \n",
|
||
"3 3.49 0.24 2.18 7.80 0.86 \n",
|
||
"4 2.69 0.39 1.82 4.32 1.04 \n",
|
||
"\n",
|
||
" od280/od315_of_diluted_wines proline \n",
|
||
"0 3.92 1065.0 \n",
|
||
"1 3.40 1050.0 \n",
|
||
"2 3.17 1185.0 \n",
|
||
"3 3.45 1480.0 \n",
|
||
"4 2.93 735.0 "
|
||
]
|
||
},
|
||
"execution_count": 54,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"X = X.drop('target', axis=1)\n",
|
||
"X.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 56,
|
||
"id": "74173e8d",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>alcohol</th>\n",
|
||
" <th>malic_acid</th>\n",
|
||
" <th>ash</th>\n",
|
||
" <th>alcalinity_of_ash</th>\n",
|
||
" <th>magnesium</th>\n",
|
||
" <th>total_phenols</th>\n",
|
||
" <th>flavanoids</th>\n",
|
||
" <th>nonflavanoid_phenols</th>\n",
|
||
" <th>proanthocyanins</th>\n",
|
||
" <th>color_intensity</th>\n",
|
||
" <th>hue</th>\n",
|
||
" <th>od280/od315_of_diluted_wines</th>\n",
|
||
" <th>proline</th>\n",
|
||
" <th>alcalinity_of_ash_2</th>\n",
|
||
" <th>total_phenols_2</th>\n",
|
||
" <th>flavanoids_2</th>\n",
|
||
" <th>hue_2</th>\n",
|
||
" <th>od280/od315_of_diluted_wines_2</th>\n",
|
||
" <th>proline_2</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>14.23</td>\n",
|
||
" <td>1.71</td>\n",
|
||
" <td>2.43</td>\n",
|
||
" <td>15.6</td>\n",
|
||
" <td>127.0</td>\n",
|
||
" <td>2.80</td>\n",
|
||
" <td>3.06</td>\n",
|
||
" <td>0.28</td>\n",
|
||
" <td>2.29</td>\n",
|
||
" <td>5.64</td>\n",
|
||
" <td>1.04</td>\n",
|
||
" <td>3.92</td>\n",
|
||
" <td>1065.0</td>\n",
|
||
" <td>243.36</td>\n",
|
||
" <td>7.8400</td>\n",
|
||
" <td>9.3636</td>\n",
|
||
" <td>1.0816</td>\n",
|
||
" <td>15.3664</td>\n",
|
||
" <td>1134225.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>13.20</td>\n",
|
||
" <td>1.78</td>\n",
|
||
" <td>2.14</td>\n",
|
||
" <td>11.2</td>\n",
|
||
" <td>100.0</td>\n",
|
||
" <td>2.65</td>\n",
|
||
" <td>2.76</td>\n",
|
||
" <td>0.26</td>\n",
|
||
" <td>1.28</td>\n",
|
||
" <td>4.38</td>\n",
|
||
" <td>1.05</td>\n",
|
||
" <td>3.40</td>\n",
|
||
" <td>1050.0</td>\n",
|
||
" <td>125.44</td>\n",
|
||
" <td>7.0225</td>\n",
|
||
" <td>7.6176</td>\n",
|
||
" <td>1.1025</td>\n",
|
||
" <td>11.5600</td>\n",
|
||
" <td>1102500.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>13.16</td>\n",
|
||
" <td>2.36</td>\n",
|
||
" <td>2.67</td>\n",
|
||
" <td>18.6</td>\n",
|
||
" <td>101.0</td>\n",
|
||
" <td>2.80</td>\n",
|
||
" <td>3.24</td>\n",
|
||
" <td>0.30</td>\n",
|
||
" <td>2.81</td>\n",
|
||
" <td>5.68</td>\n",
|
||
" <td>1.03</td>\n",
|
||
" <td>3.17</td>\n",
|
||
" <td>1185.0</td>\n",
|
||
" <td>345.96</td>\n",
|
||
" <td>7.8400</td>\n",
|
||
" <td>10.4976</td>\n",
|
||
" <td>1.0609</td>\n",
|
||
" <td>10.0489</td>\n",
|
||
" <td>1404225.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>14.37</td>\n",
|
||
" <td>1.95</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>16.8</td>\n",
|
||
" <td>113.0</td>\n",
|
||
" <td>3.85</td>\n",
|
||
" <td>3.49</td>\n",
|
||
" <td>0.24</td>\n",
|
||
" <td>2.18</td>\n",
|
||
" <td>7.80</td>\n",
|
||
" <td>0.86</td>\n",
|
||
" <td>3.45</td>\n",
|
||
" <td>1480.0</td>\n",
|
||
" <td>282.24</td>\n",
|
||
" <td>14.8225</td>\n",
|
||
" <td>12.1801</td>\n",
|
||
" <td>0.7396</td>\n",
|
||
" <td>11.9025</td>\n",
|
||
" <td>2190400.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>13.24</td>\n",
|
||
" <td>2.59</td>\n",
|
||
" <td>2.87</td>\n",
|
||
" <td>21.0</td>\n",
|
||
" <td>118.0</td>\n",
|
||
" <td>2.80</td>\n",
|
||
" <td>2.69</td>\n",
|
||
" <td>0.39</td>\n",
|
||
" <td>1.82</td>\n",
|
||
" <td>4.32</td>\n",
|
||
" <td>1.04</td>\n",
|
||
" <td>2.93</td>\n",
|
||
" <td>735.0</td>\n",
|
||
" <td>441.00</td>\n",
|
||
" <td>7.8400</td>\n",
|
||
" <td>7.2361</td>\n",
|
||
" <td>1.0816</td>\n",
|
||
" <td>8.5849</td>\n",
|
||
" <td>540225.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
|
||
"0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
|
||
"1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
|
||
"2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
|
||
"3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
|
||
"4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
|
||
"\n",
|
||
" flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
|
||
"0 3.06 0.28 2.29 5.64 1.04 \n",
|
||
"1 2.76 0.26 1.28 4.38 1.05 \n",
|
||
"2 3.24 0.30 2.81 5.68 1.03 \n",
|
||
"3 3.49 0.24 2.18 7.80 0.86 \n",
|
||
"4 2.69 0.39 1.82 4.32 1.04 \n",
|
||
"\n",
|
||
" od280/od315_of_diluted_wines proline alcalinity_of_ash_2 \\\n",
|
||
"0 3.92 1065.0 243.36 \n",
|
||
"1 3.40 1050.0 125.44 \n",
|
||
"2 3.17 1185.0 345.96 \n",
|
||
"3 3.45 1480.0 282.24 \n",
|
||
"4 2.93 735.0 441.00 \n",
|
||
"\n",
|
||
" total_phenols_2 flavanoids_2 hue_2 od280/od315_of_diluted_wines_2 \\\n",
|
||
"0 7.8400 9.3636 1.0816 15.3664 \n",
|
||
"1 7.0225 7.6176 1.1025 11.5600 \n",
|
||
"2 7.8400 10.4976 1.0609 10.0489 \n",
|
||
"3 14.8225 12.1801 0.7396 11.9025 \n",
|
||
"4 7.8400 7.2361 1.0816 8.5849 \n",
|
||
"\n",
|
||
" proline_2 \n",
|
||
"0 1134225.0 \n",
|
||
"1 1102500.0 \n",
|
||
"2 1404225.0 \n",
|
||
"3 2190400.0 \n",
|
||
"4 540225.0 "
|
||
]
|
||
},
|
||
"execution_count": 56,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"for feature_name in high_corr:\n",
|
||
" X['{0}_2'.format(feature_name)] = X[feature_name] ** 2\n",
|
||
"\n",
|
||
"X.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 57,
|
||
"id": "190f74c0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>alcohol</th>\n",
|
||
" <th>malic_acid</th>\n",
|
||
" <th>ash</th>\n",
|
||
" <th>alcalinity_of_ash</th>\n",
|
||
" <th>magnesium</th>\n",
|
||
" <th>total_phenols</th>\n",
|
||
" <th>flavanoids</th>\n",
|
||
" <th>nonflavanoid_phenols</th>\n",
|
||
" <th>proanthocyanins</th>\n",
|
||
" <th>color_intensity</th>\n",
|
||
" <th>hue</th>\n",
|
||
" <th>od280/od315_of_diluted_wines</th>\n",
|
||
" <th>proline</th>\n",
|
||
" <th>alcalinity_of_ash_2</th>\n",
|
||
" <th>total_phenols_2</th>\n",
|
||
" <th>flavanoids_2</th>\n",
|
||
" <th>hue_2</th>\n",
|
||
" <th>od280/od315_of_diluted_wines_2</th>\n",
|
||
" <th>proline_2</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>count</th>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>178.000000</td>\n",
|
||
" <td>1.780000e+02</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>mean</th>\n",
|
||
" <td>13.000618</td>\n",
|
||
" <td>2.336348</td>\n",
|
||
" <td>2.366517</td>\n",
|
||
" <td>19.494944</td>\n",
|
||
" <td>99.741573</td>\n",
|
||
" <td>2.295112</td>\n",
|
||
" <td>2.029270</td>\n",
|
||
" <td>0.361854</td>\n",
|
||
" <td>1.590899</td>\n",
|
||
" <td>5.058090</td>\n",
|
||
" <td>0.957449</td>\n",
|
||
" <td>2.611685</td>\n",
|
||
" <td>746.893258</td>\n",
|
||
" <td>391.142865</td>\n",
|
||
" <td>5.657030</td>\n",
|
||
" <td>5.110049</td>\n",
|
||
" <td>0.968661</td>\n",
|
||
" <td>7.322155</td>\n",
|
||
" <td>6.564591e+05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>std</th>\n",
|
||
" <td>0.811827</td>\n",
|
||
" <td>1.117146</td>\n",
|
||
" <td>0.274344</td>\n",
|
||
" <td>3.339564</td>\n",
|
||
" <td>14.282484</td>\n",
|
||
" <td>0.625851</td>\n",
|
||
" <td>0.998859</td>\n",
|
||
" <td>0.124453</td>\n",
|
||
" <td>0.572359</td>\n",
|
||
" <td>2.318286</td>\n",
|
||
" <td>0.228572</td>\n",
|
||
" <td>0.709990</td>\n",
|
||
" <td>314.907474</td>\n",
|
||
" <td>133.671775</td>\n",
|
||
" <td>2.936294</td>\n",
|
||
" <td>4.211441</td>\n",
|
||
" <td>0.443798</td>\n",
|
||
" <td>3.584316</td>\n",
|
||
" <td>5.558591e+05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>min</th>\n",
|
||
" <td>11.030000</td>\n",
|
||
" <td>0.740000</td>\n",
|
||
" <td>1.360000</td>\n",
|
||
" <td>10.600000</td>\n",
|
||
" <td>70.000000</td>\n",
|
||
" <td>0.980000</td>\n",
|
||
" <td>0.340000</td>\n",
|
||
" <td>0.130000</td>\n",
|
||
" <td>0.410000</td>\n",
|
||
" <td>1.280000</td>\n",
|
||
" <td>0.480000</td>\n",
|
||
" <td>1.270000</td>\n",
|
||
" <td>278.000000</td>\n",
|
||
" <td>112.360000</td>\n",
|
||
" <td>0.960400</td>\n",
|
||
" <td>0.115600</td>\n",
|
||
" <td>0.230400</td>\n",
|
||
" <td>1.612900</td>\n",
|
||
" <td>7.728400e+04</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25%</th>\n",
|
||
" <td>12.362500</td>\n",
|
||
" <td>1.602500</td>\n",
|
||
" <td>2.210000</td>\n",
|
||
" <td>17.200000</td>\n",
|
||
" <td>88.000000</td>\n",
|
||
" <td>1.742500</td>\n",
|
||
" <td>1.205000</td>\n",
|
||
" <td>0.270000</td>\n",
|
||
" <td>1.250000</td>\n",
|
||
" <td>3.220000</td>\n",
|
||
" <td>0.782500</td>\n",
|
||
" <td>1.937500</td>\n",
|
||
" <td>500.500000</td>\n",
|
||
" <td>295.840000</td>\n",
|
||
" <td>3.036325</td>\n",
|
||
" <td>1.452100</td>\n",
|
||
" <td>0.612325</td>\n",
|
||
" <td>3.754075</td>\n",
|
||
" <td>2.505010e+05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>50%</th>\n",
|
||
" <td>13.050000</td>\n",
|
||
" <td>1.865000</td>\n",
|
||
" <td>2.360000</td>\n",
|
||
" <td>19.500000</td>\n",
|
||
" <td>98.000000</td>\n",
|
||
" <td>2.355000</td>\n",
|
||
" <td>2.135000</td>\n",
|
||
" <td>0.340000</td>\n",
|
||
" <td>1.555000</td>\n",
|
||
" <td>4.690000</td>\n",
|
||
" <td>0.965000</td>\n",
|
||
" <td>2.780000</td>\n",
|
||
" <td>673.500000</td>\n",
|
||
" <td>380.250000</td>\n",
|
||
" <td>5.546050</td>\n",
|
||
" <td>4.558250</td>\n",
|
||
" <td>0.931250</td>\n",
|
||
" <td>7.728400</td>\n",
|
||
" <td>4.536045e+05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>75%</th>\n",
|
||
" <td>13.677500</td>\n",
|
||
" <td>3.082500</td>\n",
|
||
" <td>2.557500</td>\n",
|
||
" <td>21.500000</td>\n",
|
||
" <td>107.000000</td>\n",
|
||
" <td>2.800000</td>\n",
|
||
" <td>2.875000</td>\n",
|
||
" <td>0.437500</td>\n",
|
||
" <td>1.950000</td>\n",
|
||
" <td>6.200000</td>\n",
|
||
" <td>1.120000</td>\n",
|
||
" <td>3.170000</td>\n",
|
||
" <td>985.000000</td>\n",
|
||
" <td>462.250000</td>\n",
|
||
" <td>7.840000</td>\n",
|
||
" <td>8.265700</td>\n",
|
||
" <td>1.254400</td>\n",
|
||
" <td>10.048900</td>\n",
|
||
" <td>9.702250e+05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>max</th>\n",
|
||
" <td>14.830000</td>\n",
|
||
" <td>5.800000</td>\n",
|
||
" <td>3.230000</td>\n",
|
||
" <td>30.000000</td>\n",
|
||
" <td>162.000000</td>\n",
|
||
" <td>3.880000</td>\n",
|
||
" <td>5.080000</td>\n",
|
||
" <td>0.660000</td>\n",
|
||
" <td>3.580000</td>\n",
|
||
" <td>13.000000</td>\n",
|
||
" <td>1.710000</td>\n",
|
||
" <td>4.000000</td>\n",
|
||
" <td>1680.000000</td>\n",
|
||
" <td>900.000000</td>\n",
|
||
" <td>15.054400</td>\n",
|
||
" <td>25.806400</td>\n",
|
||
" <td>2.924100</td>\n",
|
||
" <td>16.000000</td>\n",
|
||
" <td>2.822400e+06</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" alcohol malic_acid ash alcalinity_of_ash magnesium \\\n",
|
||
"count 178.000000 178.000000 178.000000 178.000000 178.000000 \n",
|
||
"mean 13.000618 2.336348 2.366517 19.494944 99.741573 \n",
|
||
"std 0.811827 1.117146 0.274344 3.339564 14.282484 \n",
|
||
"min 11.030000 0.740000 1.360000 10.600000 70.000000 \n",
|
||
"25% 12.362500 1.602500 2.210000 17.200000 88.000000 \n",
|
||
"50% 13.050000 1.865000 2.360000 19.500000 98.000000 \n",
|
||
"75% 13.677500 3.082500 2.557500 21.500000 107.000000 \n",
|
||
"max 14.830000 5.800000 3.230000 30.000000 162.000000 \n",
|
||
"\n",
|
||
" total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n",
|
||
"count 178.000000 178.000000 178.000000 178.000000 \n",
|
||
"mean 2.295112 2.029270 0.361854 1.590899 \n",
|
||
"std 0.625851 0.998859 0.124453 0.572359 \n",
|
||
"min 0.980000 0.340000 0.130000 0.410000 \n",
|
||
"25% 1.742500 1.205000 0.270000 1.250000 \n",
|
||
"50% 2.355000 2.135000 0.340000 1.555000 \n",
|
||
"75% 2.800000 2.875000 0.437500 1.950000 \n",
|
||
"max 3.880000 5.080000 0.660000 3.580000 \n",
|
||
"\n",
|
||
" color_intensity hue od280/od315_of_diluted_wines proline \\\n",
|
||
"count 178.000000 178.000000 178.000000 178.000000 \n",
|
||
"mean 5.058090 0.957449 2.611685 746.893258 \n",
|
||
"std 2.318286 0.228572 0.709990 314.907474 \n",
|
||
"min 1.280000 0.480000 1.270000 278.000000 \n",
|
||
"25% 3.220000 0.782500 1.937500 500.500000 \n",
|
||
"50% 4.690000 0.965000 2.780000 673.500000 \n",
|
||
"75% 6.200000 1.120000 3.170000 985.000000 \n",
|
||
"max 13.000000 1.710000 4.000000 1680.000000 \n",
|
||
"\n",
|
||
" alcalinity_of_ash_2 total_phenols_2 flavanoids_2 hue_2 \\\n",
|
||
"count 178.000000 178.000000 178.000000 178.000000 \n",
|
||
"mean 391.142865 5.657030 5.110049 0.968661 \n",
|
||
"std 133.671775 2.936294 4.211441 0.443798 \n",
|
||
"min 112.360000 0.960400 0.115600 0.230400 \n",
|
||
"25% 295.840000 3.036325 1.452100 0.612325 \n",
|
||
"50% 380.250000 5.546050 4.558250 0.931250 \n",
|
||
"75% 462.250000 7.840000 8.265700 1.254400 \n",
|
||
"max 900.000000 15.054400 25.806400 2.924100 \n",
|
||
"\n",
|
||
" od280/od315_of_diluted_wines_2 proline_2 \n",
|
||
"count 178.000000 1.780000e+02 \n",
|
||
"mean 7.322155 6.564591e+05 \n",
|
||
"std 3.584316 5.558591e+05 \n",
|
||
"min 1.612900 7.728400e+04 \n",
|
||
"25% 3.754075 2.505010e+05 \n",
|
||
"50% 7.728400 4.536045e+05 \n",
|
||
"75% 10.048900 9.702250e+05 \n",
|
||
"max 16.000000 2.822400e+06 "
|
||
]
|
||
},
|
||
"execution_count": 57,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"X.describe()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "97f44af7",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.9.12"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|