From 930c4e6ee3fe3dd282530c89c5aee36d3233935e Mon Sep 17 00:00:00 2001
From: denis-on <98041803+denis-on@users.noreply.github.com>
Date: Wed, 30 Nov 2022 12:26:44 +0300
Subject: [PATCH] DZ les 6
---
dz_les_6.ipynb | 3982 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 3982 insertions(+)
create mode 100644 dz_les_6.ipynb
diff --git a/dz_les_6.ipynb b/dz_les_6.ipynb
new file mode 100644
index 0000000..eaf78ce
--- /dev/null
+++ b/dz_les_6.ipynb
@@ -0,0 +1,3982 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "e4f5991e",
+ "metadata": {},
+ "source": [
+ "# Тема “Обучение с учителем”"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2442aab9",
+ "metadata": {},
+ "source": [
+ "## Задание 1\n",
+ "Импортируйте библиотеки pandas и numpy.\n",
+ "\n",
+ "Загрузите \"Boston House Prices dataset\" из встроенных наборов данных библиотеки sklearn. Создайте датафреймы X и y из этих данных.\n",
+ "\n",
+ "Разбейте эти датафреймы на тренировочные (X_train, y_train) и тестовые (X_test, y_test) с помощью функции train_test_split так, чтобы размер тестовой выборки составлял 30% от всех данных, при этом аргумент random_state должен быть равен 42.\n",
+ "\n",
+ "Создайте модель линейной регрессии под названием lr с помощью класса LinearRegression из модуля sklearn.linear_model.\n",
+ "\n",
+ "Обучите модель на тренировочных данных (используйте все признаки) и сделайте предсказание на тестовых.\n",
+ "\n",
+ "Вычислите R2 полученных предсказаний с помощью r2_score из модуля sklearn.metrics."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "f79ac751",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "483c687f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.datasets import load_boston"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "fd8d693f",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "from sklearn.datasets import load_boston\n",
+ "with warnings.catch_warnings():\n",
+ "    # load_boston is deprecated and warns with FutureWarning; silence only that category here.\n",
+ "    warnings.simplefilter(\"ignore\", category=FutureWarning)\n",
+ "    boston = load_boston()\n",
+ "data = boston[\"data\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "81339e3e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CRIM | \n",
+ " ZN | \n",
+ " INDUS | \n",
+ " CHAS | \n",
+ " NOX | \n",
+ " RM | \n",
+ " AGE | \n",
+ " DIS | \n",
+ " RAD | \n",
+ " TAX | \n",
+ " PTRATIO | \n",
+ " B | \n",
+ " LSTAT | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0.00632 | \n",
+ " 18.0 | \n",
+ " 2.31 | \n",
+ " 0.0 | \n",
+ " 0.538 | \n",
+ " 6.575 | \n",
+ " 65.2 | \n",
+ " 4.0900 | \n",
+ " 1.0 | \n",
+ " 296.0 | \n",
+ " 15.3 | \n",
+ " 396.90 | \n",
+ " 4.98 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0.02731 | \n",
+ " 0.0 | \n",
+ " 7.07 | \n",
+ " 0.0 | \n",
+ " 0.469 | \n",
+ " 6.421 | \n",
+ " 78.9 | \n",
+ " 4.9671 | \n",
+ " 2.0 | \n",
+ " 242.0 | \n",
+ " 17.8 | \n",
+ " 396.90 | \n",
+ " 9.14 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0.02729 | \n",
+ " 0.0 | \n",
+ " 7.07 | \n",
+ " 0.0 | \n",
+ " 0.469 | \n",
+ " 7.185 | \n",
+ " 61.1 | \n",
+ " 4.9671 | \n",
+ " 2.0 | \n",
+ " 242.0 | \n",
+ " 17.8 | \n",
+ " 392.83 | \n",
+ " 4.03 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0.03237 | \n",
+ " 0.0 | \n",
+ " 2.18 | \n",
+ " 0.0 | \n",
+ " 0.458 | \n",
+ " 6.998 | \n",
+ " 45.8 | \n",
+ " 6.0622 | \n",
+ " 3.0 | \n",
+ " 222.0 | \n",
+ " 18.7 | \n",
+ " 394.63 | \n",
+ " 2.94 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0.06905 | \n",
+ " 0.0 | \n",
+ " 2.18 | \n",
+ " 0.0 | \n",
+ " 0.458 | \n",
+ " 7.147 | \n",
+ " 54.2 | \n",
+ " 6.0622 | \n",
+ " 3.0 | \n",
+ " 222.0 | \n",
+ " 18.7 | \n",
+ " 396.90 | \n",
+ " 5.33 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n",
+ "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n",
+ "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n",
+ "2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n",
+ "3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n",
+ "4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n",
+ "\n",
+ " PTRATIO B LSTAT \n",
+ "0 15.3 396.90 4.98 \n",
+ "1 17.8 396.90 9.14 \n",
+ "2 17.8 392.83 4.03 \n",
+ "3 18.7 394.63 2.94 \n",
+ "4 18.7 396.90 5.33 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "feature_names = boston[\"feature_names\"]\n",
+ "\n",
+ "X = pd.DataFrame(data, columns=feature_names)\n",
+ "X.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "0a3b3fbd",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 24.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 21.6 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 34.7 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 33.4 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 36.2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " price\n",
+ "0 24.0\n",
+ "1 21.6\n",
+ "2 34.7\n",
+ "3 33.4\n",
+ "4 36.2"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "target = boston[\"target\"]\n",
+ "\n",
+ "Y = pd.DataFrame(target, columns=[\"price\"])\n",
+ "Y.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "81f5f72a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.model_selection import train_test_split"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "eca2e802",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "b7fdd109",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.linear_model import LinearRegression"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "87bbc227",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lr = LinearRegression()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "41af6442",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LinearRegression()"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "lr.fit(X_train, Y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "28a67c09",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Y_test | \n",
+ " Y_pred_lr | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 173 | \n",
+ " 23.6 | \n",
+ " 28.648960 | \n",
+ "
\n",
+ " \n",
+ " | 274 | \n",
+ " 32.4 | \n",
+ " 36.495014 | \n",
+ "
\n",
+ " \n",
+ " | 491 | \n",
+ " 13.6 | \n",
+ " 15.411193 | \n",
+ "
\n",
+ " \n",
+ " | 72 | \n",
+ " 22.8 | \n",
+ " 25.403213 | \n",
+ "
\n",
+ " \n",
+ " | 452 | \n",
+ " 16.1 | \n",
+ " 18.855280 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Y_test Y_pred_lr\n",
+ "173 23.6 28.648960\n",
+ "274 32.4 36.495014\n",
+ "491 13.6 15.411193\n",
+ "72 22.8 25.403213\n",
+ "452 16.1 18.855280"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y_pred_lr = lr.predict(X_test)\n",
+ "check_test_lr = pd.DataFrame({\n",
+ " \"Y_test\": Y_test[\"price\"], \n",
+ " \"Y_pred_lr\": y_pred_lr.flatten()})\n",
+ "\n",
+ "check_test_lr.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "4a035a94",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "21.517444231176995\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.metrics import mean_squared_error\n",
+ "# sklearn metrics take (y_true, y_pred); MSE is symmetric, so the printed value is unchanged.\n",
+ "mean_squared_error_lr = mean_squared_error(check_test_lr[\"Y_test\"], check_test_lr[\"Y_pred_lr\"])\n",
+ "print(mean_squared_error_lr)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "2b87195e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.711226005748496"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.metrics import r2_score\n",
+ "\n",
+ "r2_score(Y_test, y_pred_lr)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bbf6dc90",
+ "metadata": {},
+ "source": [
+ "## Задание 2\n",
+ "\n",
+ "Создайте модель под названием model с помощью RandomForestRegressor из модуля sklearn.ensemble.\n",
+ "\n",
+ "Сделайте аргумент n_estimators равным 1000, max_depth должен быть равен 12 и random_state сделайте равным 42.\n",
+ "\n",
+ "Обучите модель на тренировочных данных аналогично тому, как вы обучали модель LinearRegression,\n",
+ "но при этом в метод fit вместо датафрейма y_train поставьте y_train.values[:, 0],\n",
+ "чтобы получить из датафрейма одномерный массив Numpy,\n",
+ "так как для класса RandomForestRegressor в данном методе для аргумента y предпочтительно применение массивов вместо датафрейма.\n",
+ "\n",
+ "Сделайте предсказание на тестовых данных и посчитайте R2.\n",
+ "\n",
+ "Сравните с результатом из предыдущего задания. Напишите в комментариях к коду, какая модель в данном случае работает лучше."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "f8f381fd",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "RandomForestRegressor(max_depth=12, n_estimators=1000, random_state=42)"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "model = RandomForestRegressor(n_estimators=1000, max_depth=12, random_state=42)\n",
+ "model.fit(X_train, Y_train.values[:, 0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "c2733e21",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y_pred_1 = model.predict(X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "ff72edb4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.87472606157312"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "r2_score(Y_test, y_pred_1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "24d68924",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " y_test | \n",
+ " y_pred_lr | \n",
+ " y_pred_rf | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 173 | \n",
+ " 23.6 | \n",
+ " 28.648960 | \n",
+ " 22.806412 | \n",
+ "
\n",
+ " \n",
+ " | 274 | \n",
+ " 32.4 | \n",
+ " 36.495014 | \n",
+ " 31.131464 | \n",
+ "
\n",
+ " \n",
+ " | 491 | \n",
+ " 13.6 | \n",
+ " 15.411193 | \n",
+ " 16.339125 | \n",
+ "
\n",
+ " \n",
+ " | 72 | \n",
+ " 22.8 | \n",
+ " 25.403213 | \n",
+ " 23.810726 | \n",
+ "
\n",
+ " \n",
+ " | 452 | \n",
+ " 16.1 | \n",
+ " 18.855280 | \n",
+ " 17.139521 | \n",
+ "
\n",
+ " \n",
+ " | 76 | \n",
+ " 20.0 | \n",
+ " 23.146689 | \n",
+ " 21.832284 | \n",
+ "
\n",
+ " \n",
+ " | 316 | \n",
+ " 17.8 | \n",
+ " 17.392124 | \n",
+ " 19.895747 | \n",
+ "
\n",
+ " \n",
+ " | 140 | \n",
+ " 14.0 | \n",
+ " 14.078599 | \n",
+ " 14.754118 | \n",
+ "
\n",
+ " \n",
+ " | 471 | \n",
+ " 19.6 | \n",
+ " 23.036927 | \n",
+ " 21.240835 | \n",
+ "
\n",
+ " \n",
+ " | 500 | \n",
+ " 16.8 | \n",
+ " 20.599433 | \n",
+ " 20.898658 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " y_test y_pred_lr y_pred_rf\n",
+ "173 23.6 28.648960 22.806412\n",
+ "274 32.4 36.495014 31.131464\n",
+ "491 13.6 15.411193 16.339125\n",
+ "72 22.8 25.403213 23.810726\n",
+ "452 16.1 18.855280 17.139521\n",
+ "76 20.0 23.146689 21.832284\n",
+ "316 17.8 17.392124 19.895747\n",
+ "140 14.0 14.078599 14.754118\n",
+ "471 19.6 23.036927 21.240835\n",
+ "500 16.8 20.599433 20.898658"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "check_test = pd.DataFrame({\n",
+ " \"y_test\": Y_test[\"price\"],\n",
+ " \"y_pred_lr\": y_pred_lr.flatten(),\n",
+ " \"y_pred_rf\": y_pred_1.flatten(),\n",
+ "})\n",
+ "\n",
+ "check_test.head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9f09783f",
+ "metadata": {},
+ "source": [
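+ "R2 из первого задания (≈0.711) меньше, чем R2 во втором задании (≈0.875), а значит, у модели, построенной с помощью RandomForestRegressor, предсказания ближе к тестовым значениям: в данном случае RandomForestRegressor работает лучше, чем LinearRegression."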
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a86c9368",
+ "metadata": {},
+ "source": [
+ "## *Задание 3\n",
+ "Вызовите документацию для класса RandomForestRegressor, найдите информацию об атрибуте feature_importances_.\n",
+ "\n",
+ "С помощью этого атрибута найдите сумму всех показателей важности, установите, какие два признака показывают наибольшую важность."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "8acc1978",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "?RandomForestRegressor"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "cd674bb4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[0.03167574 0.00154252 0.00713813 0.00123624 0.01426897 0.40268179\n",
+ " 0.01429864 0.06397257 0.00528122 0.01152493 0.01808108 0.01245085\n",
+ " 0.41584732]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(model.feature_importances_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "1e1dbef5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " feature_importance | \n",
+ " name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0.031676 | \n",
+ " CRIM | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0.001543 | \n",
+ " ZN | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0.007138 | \n",
+ " INDUS | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0.001236 | \n",
+ " CHAS | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0.014269 | \n",
+ " NOX | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 0.402682 | \n",
+ " RM | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 0.014299 | \n",
+ " AGE | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 0.063973 | \n",
+ " DIS | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 0.005281 | \n",
+ " RAD | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 0.011525 | \n",
+ " TAX | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 0.018081 | \n",
+ " PTRATIO | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " 0.012451 | \n",
+ " B | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " 0.415847 | \n",
+ " LSTAT | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " feature_importance name\n",
+ "0 0.031676 CRIM\n",
+ "1 0.001543 ZN\n",
+ "2 0.007138 INDUS\n",
+ "3 0.001236 CHAS\n",
+ "4 0.014269 NOX\n",
+ "5 0.402682 RM\n",
+ "6 0.014299 AGE\n",
+ "7 0.063973 DIS\n",
+ "8 0.005281 RAD\n",
+ "9 0.011525 TAX\n",
+ "10 0.018081 PTRATIO\n",
+ "11 0.012451 B\n",
+ "12 0.415847 LSTAT"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Dict insertion order sets the column order: importance first, then feature name.\n",
+ "feature_importance = pd.DataFrame({'feature_importance': model.feature_importances_,\n",
+ "                                   'name': X.columns})\n",
+ "feature_importance"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3bf5bd9e",
+ "metadata": {},
+ "source": [
+ "Два признака, показывающие наибольшую важность:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "82439470",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " feature_importance | \n",
+ " name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 12 | \n",
+ " 0.415847 | \n",
+ " LSTAT | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 0.402682 | \n",
+ " RM | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " feature_importance name\n",
+ "12 0.415847 LSTAT\n",
+ "5 0.402682 RM"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "feature_importance.nlargest(2, 'feature_importance')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9c507bb6",
+ "metadata": {},
+ "source": [
+ "Сумма показателей важности:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "2aae8d49",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.0"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model.feature_importances_.sum()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3e9baeab",
+ "metadata": {},
+ "source": [
+ "## *Задание 4\n",
+ "\n",
+ "В этом задании мы будем работать с датасетом, с которым мы уже знакомы по домашнему заданию по библиотеке Matplotlib, это датасет Credit Card Fraud Detection. Для этого датасета мы будем решать задачу классификации - будем определять, какие из транзакций по кредитной карте являются мошенническими. Данный датасет сильно несбалансирован (так как случаи мошенничества относительно редки), так что применение метрики accuracy не принесет пользы и не поможет выбрать лучшую модель. Мы будем вычислять AUC, то есть площадь под кривой ROC.\n",
+ "\n",
+ "Импортируйте из соответствующих модулей RandomForestClassifier, GridSearchCV и train_test_split.\n",
+ "\n",
+ "Загрузите датасет creditcard.csv и создайте датафрейм df.\n",
+ "\n",
+ "С помощью метода value_counts с аргументом normalize=True убедитесь в том, что выборка несбалансирована. Используя метод info, проверьте, все ли столбцы содержат числовые данные и нет ли в них пропусков. Примените следующую настройку, чтобы можно было просматривать все столбцы датафрейма: pd.options.display.max_columns = 100.\n",
+ "\n",
+ "Просмотрите первые 10 строк датафрейма df.\n",
+ "\n",
+ "Создайте датафрейм X из датафрейма df, исключив столбец Class.\n",
+ "\n",
+ "Создайте объект Series под названием y из столбца Class.\n",
+ "\n",
+ "Разбейте X и y на тренировочный и тестовый наборы данных при помощи функции train_test_split, используя аргументы: test_size=0.3, random_state=100, stratify=y. У вас должны получиться объекты X_train, X_test, y_train и y_test.\n",
+ "\n",
+ "Просмотрите информацию о их форме. Для поиска по сетке параметров задайте такие параметры: parameters = [{'n_estimators': [10, 15],'max_features': np.arange(3, 5),'max_depth': np.arange(4, 7)}]\n",
+ "\n",
+ "Создайте модель GridSearchCV со следующими аргументами: estimator=RandomForestClassifier(random_state=100), param_grid=parameters, scoring='roc_auc', cv=3.\n",
+ "\n",
+ "Обучите модель на тренировочном наборе данных (может занять несколько минут).\n",
+ "\n",
+ "Просмотрите параметры лучшей модели с помощью атрибута best_params_.\n",
+ "\n",
+ "Предскажите вероятности классов с помощью полученной модели и метода predict_proba.\n",
+ "\n",
+ "Из полученного результата (массив Numpy) выберите столбец с индексом 1 (вероятность класса 1) и запишите в массив y_pred_proba.\n",
+ "\n",
+ "Из модуля sklearn.metrics импортируйте метрику roc_auc_score.\n",
+ "\n",
+ "Вычислите AUC на тестовых данных и сравните с результатом, полученным на тренировочных данных, используя в качестве аргументов массивы y_test и y_pred_proba."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "c890d00a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Time | \n",
+ " V1 | \n",
+ " V2 | \n",
+ " V3 | \n",
+ " V4 | \n",
+ " V5 | \n",
+ " V6 | \n",
+ " V7 | \n",
+ " V8 | \n",
+ " V9 | \n",
+ " ... | \n",
+ " V21 | \n",
+ " V22 | \n",
+ " V23 | \n",
+ " V24 | \n",
+ " V25 | \n",
+ " V26 | \n",
+ " V27 | \n",
+ " V28 | \n",
+ " Amount | \n",
+ " Class | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0.0 | \n",
+ " -1.359807 | \n",
+ " -0.072781 | \n",
+ " 2.536347 | \n",
+ " 1.378155 | \n",
+ " -0.338321 | \n",
+ " 0.462388 | \n",
+ " 0.239599 | \n",
+ " 0.098698 | \n",
+ " 0.363787 | \n",
+ " ... | \n",
+ " -0.018307 | \n",
+ " 0.277838 | \n",
+ " -0.110474 | \n",
+ " 0.066928 | \n",
+ " 0.128539 | \n",
+ " -0.189115 | \n",
+ " 0.133558 | \n",
+ " -0.021053 | \n",
+ " 149.62 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0.0 | \n",
+ " 1.191857 | \n",
+ " 0.266151 | \n",
+ " 0.166480 | \n",
+ " 0.448154 | \n",
+ " 0.060018 | \n",
+ " -0.082361 | \n",
+ " -0.078803 | \n",
+ " 0.085102 | \n",
+ " -0.255425 | \n",
+ " ... | \n",
+ " -0.225775 | \n",
+ " -0.638672 | \n",
+ " 0.101288 | \n",
+ " -0.339846 | \n",
+ " 0.167170 | \n",
+ " 0.125895 | \n",
+ " -0.008983 | \n",
+ " 0.014724 | \n",
+ " 2.69 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1.0 | \n",
+ " -1.358354 | \n",
+ " -1.340163 | \n",
+ " 1.773209 | \n",
+ " 0.379780 | \n",
+ " -0.503198 | \n",
+ " 1.800499 | \n",
+ " 0.791461 | \n",
+ " 0.247676 | \n",
+ " -1.514654 | \n",
+ " ... | \n",
+ " 0.247998 | \n",
+ " 0.771679 | \n",
+ " 0.909412 | \n",
+ " -0.689281 | \n",
+ " -0.327642 | \n",
+ " -0.139097 | \n",
+ " -0.055353 | \n",
+ " -0.059752 | \n",
+ " 378.66 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1.0 | \n",
+ " -0.966272 | \n",
+ " -0.185226 | \n",
+ " 1.792993 | \n",
+ " -0.863291 | \n",
+ " -0.010309 | \n",
+ " 1.247203 | \n",
+ " 0.237609 | \n",
+ " 0.377436 | \n",
+ " -1.387024 | \n",
+ " ... | \n",
+ " -0.108300 | \n",
+ " 0.005274 | \n",
+ " -0.190321 | \n",
+ " -1.175575 | \n",
+ " 0.647376 | \n",
+ " -0.221929 | \n",
+ " 0.062723 | \n",
+ " 0.061458 | \n",
+ " 123.50 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2.0 | \n",
+ " -1.158233 | \n",
+ " 0.877737 | \n",
+ " 1.548718 | \n",
+ " 0.403034 | \n",
+ " -0.407193 | \n",
+ " 0.095921 | \n",
+ " 0.592941 | \n",
+ " -0.270533 | \n",
+ " 0.817739 | \n",
+ " ... | \n",
+ " -0.009431 | \n",
+ " 0.798278 | \n",
+ " -0.137458 | \n",
+ " 0.141267 | \n",
+ " -0.206010 | \n",
+ " 0.502292 | \n",
+ " 0.219422 | \n",
+ " 0.215153 | \n",
+ " 69.99 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 2.0 | \n",
+ " -0.425966 | \n",
+ " 0.960523 | \n",
+ " 1.141109 | \n",
+ " -0.168252 | \n",
+ " 0.420987 | \n",
+ " -0.029728 | \n",
+ " 0.476201 | \n",
+ " 0.260314 | \n",
+ " -0.568671 | \n",
+ " ... | \n",
+ " -0.208254 | \n",
+ " -0.559825 | \n",
+ " -0.026398 | \n",
+ " -0.371427 | \n",
+ " -0.232794 | \n",
+ " 0.105915 | \n",
+ " 0.253844 | \n",
+ " 0.081080 | \n",
+ " 3.67 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 4.0 | \n",
+ " 1.229658 | \n",
+ " 0.141004 | \n",
+ " 0.045371 | \n",
+ " 1.202613 | \n",
+ " 0.191881 | \n",
+ " 0.272708 | \n",
+ " -0.005159 | \n",
+ " 0.081213 | \n",
+ " 0.464960 | \n",
+ " ... | \n",
+ " -0.167716 | \n",
+ " -0.270710 | \n",
+ " -0.154104 | \n",
+ " -0.780055 | \n",
+ " 0.750137 | \n",
+ " -0.257237 | \n",
+ " 0.034507 | \n",
+ " 0.005168 | \n",
+ " 4.99 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 7.0 | \n",
+ " -0.644269 | \n",
+ " 1.417964 | \n",
+ " 1.074380 | \n",
+ " -0.492199 | \n",
+ " 0.948934 | \n",
+ " 0.428118 | \n",
+ " 1.120631 | \n",
+ " -3.807864 | \n",
+ " 0.615375 | \n",
+ " ... | \n",
+ " 1.943465 | \n",
+ " -1.015455 | \n",
+ " 0.057504 | \n",
+ " -0.649709 | \n",
+ " -0.415267 | \n",
+ " -0.051634 | \n",
+ " -1.206921 | \n",
+ " -1.085339 | \n",
+ " 40.80 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 7.0 | \n",
+ " -0.894286 | \n",
+ " 0.286157 | \n",
+ " -0.113192 | \n",
+ " -0.271526 | \n",
+ " 2.669599 | \n",
+ " 3.721818 | \n",
+ " 0.370145 | \n",
+ " 0.851084 | \n",
+ " -0.392048 | \n",
+ " ... | \n",
+ " -0.073425 | \n",
+ " -0.268092 | \n",
+ " -0.204233 | \n",
+ " 1.011592 | \n",
+ " 0.373205 | \n",
+ " -0.384157 | \n",
+ " 0.011747 | \n",
+ " 0.142404 | \n",
+ " 93.20 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 9.0 | \n",
+ " -0.338262 | \n",
+ " 1.119593 | \n",
+ " 1.044367 | \n",
+ " -0.222187 | \n",
+ " 0.499361 | \n",
+ " -0.246761 | \n",
+ " 0.651583 | \n",
+ " 0.069539 | \n",
+ " -0.736727 | \n",
+ " ... | \n",
+ " -0.246914 | \n",
+ " -0.633753 | \n",
+ " -0.120794 | \n",
+ " -0.385050 | \n",
+ " -0.069733 | \n",
+ " 0.094199 | \n",
+ " 0.246219 | \n",
+ " 0.083076 | \n",
+ " 3.68 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
10 rows × 31 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Time V1 V2 V3 V4 V5 V6 V7 \\\n",
+ "0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n",
+ "1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n",
+ "2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n",
+ "3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n",
+ "4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n",
+ "5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 \n",
+ "6 4.0 1.229658 0.141004 0.045371 1.202613 0.191881 0.272708 -0.005159 \n",
+ "7 7.0 -0.644269 1.417964 1.074380 -0.492199 0.948934 0.428118 1.120631 \n",
+ "8 7.0 -0.894286 0.286157 -0.113192 -0.271526 2.669599 3.721818 0.370145 \n",
+ "9 9.0 -0.338262 1.119593 1.044367 -0.222187 0.499361 -0.246761 0.651583 \n",
+ "\n",
+ " V8 V9 ... V21 V22 V23 V24 V25 \\\n",
+ "0 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 \n",
+ "1 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 \n",
+ "2 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 \n",
+ "3 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 \n",
+ "4 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 \n",
+ "5 0.260314 -0.568671 ... -0.208254 -0.559825 -0.026398 -0.371427 -0.232794 \n",
+ "6 0.081213 0.464960 ... -0.167716 -0.270710 -0.154104 -0.780055 0.750137 \n",
+ "7 -3.807864 0.615375 ... 1.943465 -1.015455 0.057504 -0.649709 -0.415267 \n",
+ "8 0.851084 -0.392048 ... -0.073425 -0.268092 -0.204233 1.011592 0.373205 \n",
+ "9 0.069539 -0.736727 ... -0.246914 -0.633753 -0.120794 -0.385050 -0.069733 \n",
+ "\n",
+ " V26 V27 V28 Amount Class \n",
+ "0 -0.189115 0.133558 -0.021053 149.62 0 \n",
+ "1 0.125895 -0.008983 0.014724 2.69 0 \n",
+ "2 -0.139097 -0.055353 -0.059752 378.66 0 \n",
+ "3 -0.221929 0.062723 0.061458 123.50 0 \n",
+ "4 0.502292 0.219422 0.215153 69.99 0 \n",
+ "5 0.105915 0.253844 0.081080 3.67 0 \n",
+ "6 -0.257237 0.034507 0.005168 4.99 0 \n",
+ "7 -0.051634 -1.206921 -1.085339 40.80 0 \n",
+ "8 -0.384157 0.011747 0.142404 93.20 0 \n",
+ "9 0.094199 0.246219 0.083076 3.68 0 \n",
+ "\n",
+ "[10 rows x 31 columns]"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.model_selection import GridSearchCV\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "df = pd.read_csv('creditcard.csv')\n",
+ "df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "0201a188",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 0.998273\n",
+ "1 0.001727\n",
+ "Name: Class, dtype: float64"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['Class'].value_counts(normalize=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "9ffa6a96",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 284807 entries, 0 to 284806\n",
+ "Data columns (total 31 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Time 284807 non-null float64\n",
+ " 1 V1 284807 non-null float64\n",
+ " 2 V2 284807 non-null float64\n",
+ " 3 V3 284807 non-null float64\n",
+ " 4 V4 284807 non-null float64\n",
+ " 5 V5 284807 non-null float64\n",
+ " 6 V6 284807 non-null float64\n",
+ " 7 V7 284807 non-null float64\n",
+ " 8 V8 284807 non-null float64\n",
+ " 9 V9 284807 non-null float64\n",
+ " 10 V10 284807 non-null float64\n",
+ " 11 V11 284807 non-null float64\n",
+ " 12 V12 284807 non-null float64\n",
+ " 13 V13 284807 non-null float64\n",
+ " 14 V14 284807 non-null float64\n",
+ " 15 V15 284807 non-null float64\n",
+ " 16 V16 284807 non-null float64\n",
+ " 17 V17 284807 non-null float64\n",
+ " 18 V18 284807 non-null float64\n",
+ " 19 V19 284807 non-null float64\n",
+ " 20 V20 284807 non-null float64\n",
+ " 21 V21 284807 non-null float64\n",
+ " 22 V22 284807 non-null float64\n",
+ " 23 V23 284807 non-null float64\n",
+ " 24 V24 284807 non-null float64\n",
+ " 25 V25 284807 non-null float64\n",
+ " 26 V26 284807 non-null float64\n",
+ " 27 V27 284807 non-null float64\n",
+ " 28 V28 284807 non-null float64\n",
+ " 29 Amount 284807 non-null float64\n",
+ " 30 Class 284807 non-null int64 \n",
+ "dtypes: float64(30), int64(1)\n",
+ "memory usage: 67.4 MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "8d3439e3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.options.display.max_columns=100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "0f57a690",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Time | \n",
+ " V1 | \n",
+ " V2 | \n",
+ " V3 | \n",
+ " V4 | \n",
+ " V5 | \n",
+ " V6 | \n",
+ " V7 | \n",
+ " V8 | \n",
+ " V9 | \n",
+ " V10 | \n",
+ " V11 | \n",
+ " V12 | \n",
+ " V13 | \n",
+ " V14 | \n",
+ " V15 | \n",
+ " V16 | \n",
+ " V17 | \n",
+ " V18 | \n",
+ " V19 | \n",
+ " V20 | \n",
+ " V21 | \n",
+ " V22 | \n",
+ " V23 | \n",
+ " V24 | \n",
+ " V25 | \n",
+ " V26 | \n",
+ " V27 | \n",
+ " V28 | \n",
+ " Amount | \n",
+ " Class | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0.0 | \n",
+ " -1.359807 | \n",
+ " -0.072781 | \n",
+ " 2.536347 | \n",
+ " 1.378155 | \n",
+ " -0.338321 | \n",
+ " 0.462388 | \n",
+ " 0.239599 | \n",
+ " 0.098698 | \n",
+ " 0.363787 | \n",
+ " 0.090794 | \n",
+ " -0.551600 | \n",
+ " -0.617801 | \n",
+ " -0.991390 | \n",
+ " -0.311169 | \n",
+ " 1.468177 | \n",
+ " -0.470401 | \n",
+ " 0.207971 | \n",
+ " 0.025791 | \n",
+ " 0.403993 | \n",
+ " 0.251412 | \n",
+ " -0.018307 | \n",
+ " 0.277838 | \n",
+ " -0.110474 | \n",
+ " 0.066928 | \n",
+ " 0.128539 | \n",
+ " -0.189115 | \n",
+ " 0.133558 | \n",
+ " -0.021053 | \n",
+ " 149.62 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0.0 | \n",
+ " 1.191857 | \n",
+ " 0.266151 | \n",
+ " 0.166480 | \n",
+ " 0.448154 | \n",
+ " 0.060018 | \n",
+ " -0.082361 | \n",
+ " -0.078803 | \n",
+ " 0.085102 | \n",
+ " -0.255425 | \n",
+ " -0.166974 | \n",
+ " 1.612727 | \n",
+ " 1.065235 | \n",
+ " 0.489095 | \n",
+ " -0.143772 | \n",
+ " 0.635558 | \n",
+ " 0.463917 | \n",
+ " -0.114805 | \n",
+ " -0.183361 | \n",
+ " -0.145783 | \n",
+ " -0.069083 | \n",
+ " -0.225775 | \n",
+ " -0.638672 | \n",
+ " 0.101288 | \n",
+ " -0.339846 | \n",
+ " 0.167170 | \n",
+ " 0.125895 | \n",
+ " -0.008983 | \n",
+ " 0.014724 | \n",
+ " 2.69 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1.0 | \n",
+ " -1.358354 | \n",
+ " -1.340163 | \n",
+ " 1.773209 | \n",
+ " 0.379780 | \n",
+ " -0.503198 | \n",
+ " 1.800499 | \n",
+ " 0.791461 | \n",
+ " 0.247676 | \n",
+ " -1.514654 | \n",
+ " 0.207643 | \n",
+ " 0.624501 | \n",
+ " 0.066084 | \n",
+ " 0.717293 | \n",
+ " -0.165946 | \n",
+ " 2.345865 | \n",
+ " -2.890083 | \n",
+ " 1.109969 | \n",
+ " -0.121359 | \n",
+ " -2.261857 | \n",
+ " 0.524980 | \n",
+ " 0.247998 | \n",
+ " 0.771679 | \n",
+ " 0.909412 | \n",
+ " -0.689281 | \n",
+ " -0.327642 | \n",
+ " -0.139097 | \n",
+ " -0.055353 | \n",
+ " -0.059752 | \n",
+ " 378.66 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1.0 | \n",
+ " -0.966272 | \n",
+ " -0.185226 | \n",
+ " 1.792993 | \n",
+ " -0.863291 | \n",
+ " -0.010309 | \n",
+ " 1.247203 | \n",
+ " 0.237609 | \n",
+ " 0.377436 | \n",
+ " -1.387024 | \n",
+ " -0.054952 | \n",
+ " -0.226487 | \n",
+ " 0.178228 | \n",
+ " 0.507757 | \n",
+ " -0.287924 | \n",
+ " -0.631418 | \n",
+ " -1.059647 | \n",
+ " -0.684093 | \n",
+ " 1.965775 | \n",
+ " -1.232622 | \n",
+ " -0.208038 | \n",
+ " -0.108300 | \n",
+ " 0.005274 | \n",
+ " -0.190321 | \n",
+ " -1.175575 | \n",
+ " 0.647376 | \n",
+ " -0.221929 | \n",
+ " 0.062723 | \n",
+ " 0.061458 | \n",
+ " 123.50 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2.0 | \n",
+ " -1.158233 | \n",
+ " 0.877737 | \n",
+ " 1.548718 | \n",
+ " 0.403034 | \n",
+ " -0.407193 | \n",
+ " 0.095921 | \n",
+ " 0.592941 | \n",
+ " -0.270533 | \n",
+ " 0.817739 | \n",
+ " 0.753074 | \n",
+ " -0.822843 | \n",
+ " 0.538196 | \n",
+ " 1.345852 | \n",
+ " -1.119670 | \n",
+ " 0.175121 | \n",
+ " -0.451449 | \n",
+ " -0.237033 | \n",
+ " -0.038195 | \n",
+ " 0.803487 | \n",
+ " 0.408542 | \n",
+ " -0.009431 | \n",
+ " 0.798278 | \n",
+ " -0.137458 | \n",
+ " 0.141267 | \n",
+ " -0.206010 | \n",
+ " 0.502292 | \n",
+ " 0.219422 | \n",
+ " 0.215153 | \n",
+ " 69.99 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 2.0 | \n",
+ " -0.425966 | \n",
+ " 0.960523 | \n",
+ " 1.141109 | \n",
+ " -0.168252 | \n",
+ " 0.420987 | \n",
+ " -0.029728 | \n",
+ " 0.476201 | \n",
+ " 0.260314 | \n",
+ " -0.568671 | \n",
+ " -0.371407 | \n",
+ " 1.341262 | \n",
+ " 0.359894 | \n",
+ " -0.358091 | \n",
+ " -0.137134 | \n",
+ " 0.517617 | \n",
+ " 0.401726 | \n",
+ " -0.058133 | \n",
+ " 0.068653 | \n",
+ " -0.033194 | \n",
+ " 0.084968 | \n",
+ " -0.208254 | \n",
+ " -0.559825 | \n",
+ " -0.026398 | \n",
+ " -0.371427 | \n",
+ " -0.232794 | \n",
+ " 0.105915 | \n",
+ " 0.253844 | \n",
+ " 0.081080 | \n",
+ " 3.67 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 4.0 | \n",
+ " 1.229658 | \n",
+ " 0.141004 | \n",
+ " 0.045371 | \n",
+ " 1.202613 | \n",
+ " 0.191881 | \n",
+ " 0.272708 | \n",
+ " -0.005159 | \n",
+ " 0.081213 | \n",
+ " 0.464960 | \n",
+ " -0.099254 | \n",
+ " -1.416907 | \n",
+ " -0.153826 | \n",
+ " -0.751063 | \n",
+ " 0.167372 | \n",
+ " 0.050144 | \n",
+ " -0.443587 | \n",
+ " 0.002821 | \n",
+ " -0.611987 | \n",
+ " -0.045575 | \n",
+ " -0.219633 | \n",
+ " -0.167716 | \n",
+ " -0.270710 | \n",
+ " -0.154104 | \n",
+ " -0.780055 | \n",
+ " 0.750137 | \n",
+ " -0.257237 | \n",
+ " 0.034507 | \n",
+ " 0.005168 | \n",
+ " 4.99 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 7.0 | \n",
+ " -0.644269 | \n",
+ " 1.417964 | \n",
+ " 1.074380 | \n",
+ " -0.492199 | \n",
+ " 0.948934 | \n",
+ " 0.428118 | \n",
+ " 1.120631 | \n",
+ " -3.807864 | \n",
+ " 0.615375 | \n",
+ " 1.249376 | \n",
+ " -0.619468 | \n",
+ " 0.291474 | \n",
+ " 1.757964 | \n",
+ " -1.323865 | \n",
+ " 0.686133 | \n",
+ " -0.076127 | \n",
+ " -1.222127 | \n",
+ " -0.358222 | \n",
+ " 0.324505 | \n",
+ " -0.156742 | \n",
+ " 1.943465 | \n",
+ " -1.015455 | \n",
+ " 0.057504 | \n",
+ " -0.649709 | \n",
+ " -0.415267 | \n",
+ " -0.051634 | \n",
+ " -1.206921 | \n",
+ " -1.085339 | \n",
+ " 40.80 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 7.0 | \n",
+ " -0.894286 | \n",
+ " 0.286157 | \n",
+ " -0.113192 | \n",
+ " -0.271526 | \n",
+ " 2.669599 | \n",
+ " 3.721818 | \n",
+ " 0.370145 | \n",
+ " 0.851084 | \n",
+ " -0.392048 | \n",
+ " -0.410430 | \n",
+ " -0.705117 | \n",
+ " -0.110452 | \n",
+ " -0.286254 | \n",
+ " 0.074355 | \n",
+ " -0.328783 | \n",
+ " -0.210077 | \n",
+ " -0.499768 | \n",
+ " 0.118765 | \n",
+ " 0.570328 | \n",
+ " 0.052736 | \n",
+ " -0.073425 | \n",
+ " -0.268092 | \n",
+ " -0.204233 | \n",
+ " 1.011592 | \n",
+ " 0.373205 | \n",
+ " -0.384157 | \n",
+ " 0.011747 | \n",
+ " 0.142404 | \n",
+ " 93.20 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 9.0 | \n",
+ " -0.338262 | \n",
+ " 1.119593 | \n",
+ " 1.044367 | \n",
+ " -0.222187 | \n",
+ " 0.499361 | \n",
+ " -0.246761 | \n",
+ " 0.651583 | \n",
+ " 0.069539 | \n",
+ " -0.736727 | \n",
+ " -0.366846 | \n",
+ " 1.017614 | \n",
+ " 0.836390 | \n",
+ " 1.006844 | \n",
+ " -0.443523 | \n",
+ " 0.150219 | \n",
+ " 0.739453 | \n",
+ " -0.540980 | \n",
+ " 0.476677 | \n",
+ " 0.451773 | \n",
+ " 0.203711 | \n",
+ " -0.246914 | \n",
+ " -0.633753 | \n",
+ " -0.120794 | \n",
+ " -0.385050 | \n",
+ " -0.069733 | \n",
+ " 0.094199 | \n",
+ " 0.246219 | \n",
+ " 0.083076 | \n",
+ " 3.68 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Time V1 V2 V3 V4 V5 V6 V7 \\\n",
+ "0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n",
+ "1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n",
+ "2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n",
+ "3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n",
+ "4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n",
+ "5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 \n",
+ "6 4.0 1.229658 0.141004 0.045371 1.202613 0.191881 0.272708 -0.005159 \n",
+ "7 7.0 -0.644269 1.417964 1.074380 -0.492199 0.948934 0.428118 1.120631 \n",
+ "8 7.0 -0.894286 0.286157 -0.113192 -0.271526 2.669599 3.721818 0.370145 \n",
+ "9 9.0 -0.338262 1.119593 1.044367 -0.222187 0.499361 -0.246761 0.651583 \n",
+ "\n",
+ " V8 V9 V10 V11 V12 V13 V14 \\\n",
+ "0 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 \n",
+ "1 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 \n",
+ "2 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 \n",
+ "3 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 \n",
+ "4 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 \n",
+ "5 0.260314 -0.568671 -0.371407 1.341262 0.359894 -0.358091 -0.137134 \n",
+ "6 0.081213 0.464960 -0.099254 -1.416907 -0.153826 -0.751063 0.167372 \n",
+ "7 -3.807864 0.615375 1.249376 -0.619468 0.291474 1.757964 -1.323865 \n",
+ "8 0.851084 -0.392048 -0.410430 -0.705117 -0.110452 -0.286254 0.074355 \n",
+ "9 0.069539 -0.736727 -0.366846 1.017614 0.836390 1.006844 -0.443523 \n",
+ "\n",
+ " V15 V16 V17 V18 V19 V20 V21 \\\n",
+ "0 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 \n",
+ "1 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 \n",
+ "2 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 \n",
+ "3 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 -0.208038 -0.108300 \n",
+ "4 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 \n",
+ "5 0.517617 0.401726 -0.058133 0.068653 -0.033194 0.084968 -0.208254 \n",
+ "6 0.050144 -0.443587 0.002821 -0.611987 -0.045575 -0.219633 -0.167716 \n",
+ "7 0.686133 -0.076127 -1.222127 -0.358222 0.324505 -0.156742 1.943465 \n",
+ "8 -0.328783 -0.210077 -0.499768 0.118765 0.570328 0.052736 -0.073425 \n",
+ "9 0.150219 0.739453 -0.540980 0.476677 0.451773 0.203711 -0.246914 \n",
+ "\n",
+ " V22 V23 V24 V25 V26 V27 V28 \\\n",
+ "0 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 \n",
+ "1 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 \n",
+ "2 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 \n",
+ "3 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 \n",
+ "4 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 \n",
+ "5 -0.559825 -0.026398 -0.371427 -0.232794 0.105915 0.253844 0.081080 \n",
+ "6 -0.270710 -0.154104 -0.780055 0.750137 -0.257237 0.034507 0.005168 \n",
+ "7 -1.015455 0.057504 -0.649709 -0.415267 -0.051634 -1.206921 -1.085339 \n",
+ "8 -0.268092 -0.204233 1.011592 0.373205 -0.384157 0.011747 0.142404 \n",
+ "9 -0.633753 -0.120794 -0.385050 -0.069733 0.094199 0.246219 0.083076 \n",
+ "\n",
+ " Amount Class \n",
+ "0 149.62 0 \n",
+ "1 2.69 0 \n",
+ "2 378.66 0 \n",
+ "3 123.50 0 \n",
+ "4 69.99 0 \n",
+ "5 3.67 0 \n",
+ "6 4.99 0 \n",
+ "7 40.80 0 \n",
+ "8 93.20 0 \n",
+ "9 3.68 0 "
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "3530430c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X = df.drop(\"Class\", axis=1)\n",
+ "y = df[\"Class\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "c66ea2e2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "1e628dd0",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "X_train (199364, 30)\n",
+ "X_test (85443, 30)\n",
+ "y_train (199364,)\n",
+ "y_test (85443,)\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('X_train ', X_train.shape)\n",
+ "print('X_test ', X_test.shape)\n",
+ "print('y_train ', y_train.shape)\n",
+ "print('y_test ', y_test.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "8908147f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hyper-parameter grid passed to the cross-validated search as param_grid.\n",
+ "parameters = [{\n",
+ "    'n_estimators': [10, 15],\n",
+ "    'max_features': np.arange(3, 5),\n",
+ "    'max_depth': np.arange(4, 7),\n",
+ "}]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "8f593bef",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "clf = GridSearchCV(\n",
+ " estimator=RandomForestClassifier(random_state=100),\n",
+ " param_grid=parameters,\n",
+ " scoring='roc_auc',\n",
+ " cv=3,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "99d17337",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=100),\n",
+ " param_grid=[{'max_depth': array([4, 5, 6]),\n",
+ " 'max_features': array([3, 4]),\n",
+ " 'n_estimators': [10, 15]}],\n",
+ " scoring='roc_auc')"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "clf.fit(X_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "79f8c7e5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'max_depth': 6, 'max_features': 3, 'n_estimators': 15}"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "clf.best_params_"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "74efab0a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "RandomForestClassifier(max_depth=6, max_features=3, n_estimators=15)"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Refit a single forest with the parameters reported by clf.best_params_ above.\n",
+ "# random_state matches the estimator used inside GridSearchCV, so the fit and the\n",
+ "# ROC AUC computed from it are reproducible on Restart & Run All.\n",
+ "clf = RandomForestClassifier(max_depth=6, max_features=3, n_estimators=15, random_state=100)\n",
+ "\n",
+ "clf.fit(X_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "e4d1fe4a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y_pred = clf.predict_proba(X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "49357f79",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y_pred_proba = y_pred[:, 1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "fc40ec74",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.metrics import roc_auc_score"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "84b0112b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.9476239854368701"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "roc_auc_score(y_test, y_pred_proba)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "475ac08f",
+ "metadata": {},
+ "source": [
+ "# *Дополнительные задания:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3728fe05",
+ "metadata": {},
+ "source": [
+ "Загрузите датасет Wine из встроенных датасетов sklearn.datasets с помощью функции load_wine в переменную data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "e0600074",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.datasets import load_wine\n",
+ "data = load_wine()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6e22dfd0",
+ "metadata": {},
+ "source": [
+ "Полученный датасет не является датафреймом. Это структура данных, имеющая ключи аналогично словарю. Просмотрите тип данных этой структуры данных и создайте список data_keys, содержащий ее ключи."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "1cc31a29",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " \n",
+ "\n",
+ "dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(type(data), '\\n')\n",
+ "# keys() returns a dict_keys view; the task asks for a list, so materialize it.\n",
+ "data_keys = list(data.keys())\n",
+ "print(data_keys)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "05b0491c",
+ "metadata": {},
+ "source": [
+ "Просмотрите данные, описание и названия признаков в датасете. Описание нужно вывести в виде привычного, аккуратно оформленного текста, без обозначений переноса строки, но с самими переносами и т.д."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "67997daa",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,\n",
+ " 1.065e+03],\n",
+ " [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,\n",
+ " 1.050e+03],\n",
+ " [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,\n",
+ " 1.185e+03],\n",
+ " ...,\n",
+ " [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,\n",
+ " 8.350e+02],\n",
+ " [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,\n",
+ " 8.400e+02],\n",
+ " [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,\n",
+ " 5.600e+02]])"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "42bf0f1f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ ".. _wine_dataset:\n",
+ "\n",
+ "Wine recognition dataset\n",
+ "------------------------\n",
+ "\n",
+ "**Data Set Characteristics:**\n",
+ "\n",
+ " :Number of Instances: 178 (50 in each of three classes)\n",
+ " :Number of Attributes: 13 numeric, predictive attributes and the class\n",
+ " :Attribute Information:\n",
+ " \t\t- Alcohol\n",
+ " \t\t- Malic acid\n",
+ " \t\t- Ash\n",
+ "\t\t- Alcalinity of ash \n",
+ " \t\t- Magnesium\n",
+ "\t\t- Total phenols\n",
+ " \t\t- Flavanoids\n",
+ " \t\t- Nonflavanoid phenols\n",
+ " \t\t- Proanthocyanins\n",
+ "\t\t- Color intensity\n",
+ " \t\t- Hue\n",
+ " \t\t- OD280/OD315 of diluted wines\n",
+ " \t\t- Proline\n",
+ "\n",
+ " - class:\n",
+ " - class_0\n",
+ " - class_1\n",
+ " - class_2\n",
+ "\t\t\n",
+ " :Summary Statistics:\n",
+ " \n",
+ " ============================= ==== ===== ======= =====\n",
+ " Min Max Mean SD\n",
+ " ============================= ==== ===== ======= =====\n",
+ " Alcohol: 11.0 14.8 13.0 0.8\n",
+ " Malic Acid: 0.74 5.80 2.34 1.12\n",
+ " Ash: 1.36 3.23 2.36 0.27\n",
+ " Alcalinity of Ash: 10.6 30.0 19.5 3.3\n",
+ " Magnesium: 70.0 162.0 99.7 14.3\n",
+ " Total Phenols: 0.98 3.88 2.29 0.63\n",
+ " Flavanoids: 0.34 5.08 2.03 1.00\n",
+ " Nonflavanoid Phenols: 0.13 0.66 0.36 0.12\n",
+ " Proanthocyanins: 0.41 3.58 1.59 0.57\n",
+ " Colour Intensity: 1.3 13.0 5.1 2.3\n",
+ " Hue: 0.48 1.71 0.96 0.23\n",
+ " OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.71\n",
+ " Proline: 278 1680 746 315\n",
+ " ============================= ==== ===== ======= =====\n",
+ "\n",
+ " :Missing Attribute Values: None\n",
+ " :Class Distribution: class_0 (59), class_1 (71), class_2 (48)\n",
+ " :Creator: R.A. Fisher\n",
+ " :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n",
+ " :Date: July, 1988\n",
+ "\n",
+ "This is a copy of UCI ML Wine recognition datasets.\n",
+ "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data\n",
+ "\n",
+ "The data is the results of a chemical analysis of wines grown in the same\n",
+ "region in Italy by three different cultivators. There are thirteen different\n",
+ "measurements taken for different constituents found in the three types of\n",
+ "wine.\n",
+ "\n",
+ "Original Owners: \n",
+ "\n",
+ "Forina, M. et al, PARVUS - \n",
+ "An Extendible Package for Data Exploration, Classification and Correlation. \n",
+ "Institute of Pharmaceutical and Food Analysis and Technologies,\n",
+ "Via Brigata Salerno, 16147 Genoa, Italy.\n",
+ "\n",
+ "Citation:\n",
+ "\n",
+ "Lichman, M. (2013). UCI Machine Learning Repository\n",
+ "[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,\n",
+ "School of Information and Computer Science. \n",
+ "\n",
+ ".. topic:: References\n",
+ "\n",
+ " (1) S. Aeberhard, D. Coomans and O. de Vel, \n",
+ " Comparison of Classifiers in High Dimensional Settings, \n",
+ " Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of \n",
+ " Mathematics and Statistics, James Cook University of North Queensland. \n",
+ " (Also submitted to Technometrics). \n",
+ "\n",
+ " The data was used with many others for comparing various \n",
+ " classifiers. The classes are separable, though only RDA \n",
+ " has achieved 100% correct classification. \n",
+ " (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) \n",
+ " (All results using the leave-one-out technique) \n",
+ "\n",
+ " (2) S. Aeberhard, D. Coomans and O. de Vel, \n",
+ " \"THE CLASSIFICATION PERFORMANCE OF RDA\" \n",
+ " Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of \n",
+ " Mathematics and Statistics, James Cook University of North Queensland. \n",
+ " (Also submitted to Journal of Chemometrics).\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(data.DESCR)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "3990394e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['alcohol',\n",
+ " 'malic_acid',\n",
+ " 'ash',\n",
+ " 'alcalinity_of_ash',\n",
+ " 'magnesium',\n",
+ " 'total_phenols',\n",
+ " 'flavanoids',\n",
+ " 'nonflavanoid_phenols',\n",
+ " 'proanthocyanins',\n",
+ " 'color_intensity',\n",
+ " 'hue',\n",
+ " 'od280/od315_of_diluted_wines',\n",
+ " 'proline']"
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.feature_names"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ca3e3b90",
+ "metadata": {},
+ "source": [
+ "Сколько классов содержит целевая переменная датасета? Выведите названия классов."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "3dcc2473",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Количество классов: (3,) \n",
+ "\n",
+ "Названия классов: ['class_0' 'class_1' 'class_2']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# .size is the number of distinct labels as a plain count; .shape would print the tuple (3,).\n",
+ "print('Количество классов: ', np.unique(data[\"target\"]).size, '\\n')\n",
+ "print('Названия классов: ', data[\"target_names\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4eb0d981",
+ "metadata": {},
+ "source": [
+ "На основе данных датасета (они содержатся в двумерном массиве Numpy) и названий признаков создайте датафрейм под названием X."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "52257354",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " alcohol | \n",
+ " malic_acid | \n",
+ " ash | \n",
+ " alcalinity_of_ash | \n",
+ " magnesium | \n",
+ " total_phenols | \n",
+ " flavanoids | \n",
+ " nonflavanoid_phenols | \n",
+ " proanthocyanins | \n",
+ " color_intensity | \n",
+ " hue | \n",
+ " od280/od315_of_diluted_wines | \n",
+ " proline | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 14.23 | \n",
+ " 1.71 | \n",
+ " 2.43 | \n",
+ " 15.6 | \n",
+ " 127.0 | \n",
+ " 2.80 | \n",
+ " 3.06 | \n",
+ " 0.28 | \n",
+ " 2.29 | \n",
+ " 5.64 | \n",
+ " 1.04 | \n",
+ " 3.92 | \n",
+ " 1065.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 13.20 | \n",
+ " 1.78 | \n",
+ " 2.14 | \n",
+ " 11.2 | \n",
+ " 100.0 | \n",
+ " 2.65 | \n",
+ " 2.76 | \n",
+ " 0.26 | \n",
+ " 1.28 | \n",
+ " 4.38 | \n",
+ " 1.05 | \n",
+ " 3.40 | \n",
+ " 1050.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 13.16 | \n",
+ " 2.36 | \n",
+ " 2.67 | \n",
+ " 18.6 | \n",
+ " 101.0 | \n",
+ " 2.80 | \n",
+ " 3.24 | \n",
+ " 0.30 | \n",
+ " 2.81 | \n",
+ " 5.68 | \n",
+ " 1.03 | \n",
+ " 3.17 | \n",
+ " 1185.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 14.37 | \n",
+ " 1.95 | \n",
+ " 2.50 | \n",
+ " 16.8 | \n",
+ " 113.0 | \n",
+ " 3.85 | \n",
+ " 3.49 | \n",
+ " 0.24 | \n",
+ " 2.18 | \n",
+ " 7.80 | \n",
+ " 0.86 | \n",
+ " 3.45 | \n",
+ " 1480.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 13.24 | \n",
+ " 2.59 | \n",
+ " 2.87 | \n",
+ " 21.0 | \n",
+ " 118.0 | \n",
+ " 2.80 | \n",
+ " 2.69 | \n",
+ " 0.39 | \n",
+ " 1.82 | \n",
+ " 4.32 | \n",
+ " 1.04 | \n",
+ " 2.93 | \n",
+ " 735.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
+ "0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
+ "1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
+ "2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
+ "3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
+ "4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
+ "\n",
+ " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
+ "0 3.06 0.28 2.29 5.64 1.04 \n",
+ "1 2.76 0.26 1.28 4.38 1.05 \n",
+ "2 3.24 0.30 2.81 5.68 1.03 \n",
+ "3 3.49 0.24 2.18 7.80 0.86 \n",
+ "4 2.69 0.39 1.82 4.32 1.04 \n",
+ "\n",
+ " od280/od315_of_diluted_wines proline \n",
+ "0 3.92 1065.0 \n",
+ "1 3.40 1050.0 \n",
+ "2 3.17 1185.0 \n",
+ "3 3.45 1480.0 \n",
+ "4 2.93 735.0 "
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X = pd.DataFrame(data.data, columns=data.feature_names)\n",
+ "X.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "917c33ed",
+ "metadata": {},
+ "source": [
+ "Выясните размер датафрейма X и установите, имеются ли в нем пропущенные значения."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "f66d1569",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(178, 13)"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "4a1379f8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 178 entries, 0 to 177\n",
+ "Data columns (total 13 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 alcohol 178 non-null float64\n",
+ " 1 malic_acid 178 non-null float64\n",
+ " 2 ash 178 non-null float64\n",
+ " 3 alcalinity_of_ash 178 non-null float64\n",
+ " 4 magnesium 178 non-null float64\n",
+ " 5 total_phenols 178 non-null float64\n",
+ " 6 flavanoids 178 non-null float64\n",
+ " 7 nonflavanoid_phenols 178 non-null float64\n",
+ " 8 proanthocyanins 178 non-null float64\n",
+ " 9 color_intensity 178 non-null float64\n",
+ " 10 hue 178 non-null float64\n",
+ " 11 od280/od315_of_diluted_wines 178 non-null float64\n",
+ " 12 proline 178 non-null float64\n",
+ "dtypes: float64(13)\n",
+ "memory usage: 18.2 KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "X.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "f5573521",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "alcohol 0\n",
+ "malic_acid 0\n",
+ "ash 0\n",
+ "alcalinity_of_ash 0\n",
+ "magnesium 0\n",
+ "total_phenols 0\n",
+ "flavanoids 0\n",
+ "nonflavanoid_phenols 0\n",
+ "proanthocyanins 0\n",
+ "color_intensity 0\n",
+ "hue 0\n",
+ "od280/od315_of_diluted_wines 0\n",
+ "proline 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Summing the boolean mask gives the per-column count of missing values directly.\n",
+ "X.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7fcfb081",
+ "metadata": {},
+ "source": [
+ "Добавьте в датафрейм поле с классами вин в виде чисел, имеющих тип данных numpy.int64. Название поля - 'target'."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "89d0aa13",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 178 entries, 0 to 177\n",
+ "Data columns (total 14 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 alcohol 178 non-null float64\n",
+ " 1 malic_acid 178 non-null float64\n",
+ " 2 ash 178 non-null float64\n",
+ " 3 alcalinity_of_ash 178 non-null float64\n",
+ " 4 magnesium 178 non-null float64\n",
+ " 5 total_phenols 178 non-null float64\n",
+ " 6 flavanoids 178 non-null float64\n",
+ " 7 nonflavanoid_phenols 178 non-null float64\n",
+ " 8 proanthocyanins 178 non-null float64\n",
+ " 9 color_intensity 178 non-null float64\n",
+ " 10 hue 178 non-null float64\n",
+ " 11 od280/od315_of_diluted_wines 178 non-null float64\n",
+ " 12 proline 178 non-null float64\n",
+ " 13 target 178 non-null int64 \n",
+ "dtypes: float64(13), int64(1)\n",
+ "memory usage: 19.6 KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "X[\"target\"]=data[\"target\"].astype(np.int64)\n",
+ "X.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "id": "50bcdef6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " alcohol | \n",
+ " malic_acid | \n",
+ " ash | \n",
+ " alcalinity_of_ash | \n",
+ " magnesium | \n",
+ " total_phenols | \n",
+ " flavanoids | \n",
+ " nonflavanoid_phenols | \n",
+ " proanthocyanins | \n",
+ " color_intensity | \n",
+ " hue | \n",
+ " od280/od315_of_diluted_wines | \n",
+ " proline | \n",
+ " target | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 14.23 | \n",
+ " 1.71 | \n",
+ " 2.43 | \n",
+ " 15.6 | \n",
+ " 127.0 | \n",
+ " 2.80 | \n",
+ " 3.06 | \n",
+ " 0.28 | \n",
+ " 2.29 | \n",
+ " 5.64 | \n",
+ " 1.04 | \n",
+ " 3.92 | \n",
+ " 1065.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 13.20 | \n",
+ " 1.78 | \n",
+ " 2.14 | \n",
+ " 11.2 | \n",
+ " 100.0 | \n",
+ " 2.65 | \n",
+ " 2.76 | \n",
+ " 0.26 | \n",
+ " 1.28 | \n",
+ " 4.38 | \n",
+ " 1.05 | \n",
+ " 3.40 | \n",
+ " 1050.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 13.16 | \n",
+ " 2.36 | \n",
+ " 2.67 | \n",
+ " 18.6 | \n",
+ " 101.0 | \n",
+ " 2.80 | \n",
+ " 3.24 | \n",
+ " 0.30 | \n",
+ " 2.81 | \n",
+ " 5.68 | \n",
+ " 1.03 | \n",
+ " 3.17 | \n",
+ " 1185.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 14.37 | \n",
+ " 1.95 | \n",
+ " 2.50 | \n",
+ " 16.8 | \n",
+ " 113.0 | \n",
+ " 3.85 | \n",
+ " 3.49 | \n",
+ " 0.24 | \n",
+ " 2.18 | \n",
+ " 7.80 | \n",
+ " 0.86 | \n",
+ " 3.45 | \n",
+ " 1480.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 13.24 | \n",
+ " 2.59 | \n",
+ " 2.87 | \n",
+ " 21.0 | \n",
+ " 118.0 | \n",
+ " 2.80 | \n",
+ " 2.69 | \n",
+ " 0.39 | \n",
+ " 1.82 | \n",
+ " 4.32 | \n",
+ " 1.04 | \n",
+ " 2.93 | \n",
+ " 735.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
+ "0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
+ "1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
+ "2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
+ "3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
+ "4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
+ "\n",
+ " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
+ "0 3.06 0.28 2.29 5.64 1.04 \n",
+ "1 2.76 0.26 1.28 4.38 1.05 \n",
+ "2 3.24 0.30 2.81 5.68 1.03 \n",
+ "3 3.49 0.24 2.18 7.80 0.86 \n",
+ "4 2.69 0.39 1.82 4.32 1.04 \n",
+ "\n",
+ " od280/od315_of_diluted_wines proline target \n",
+ "0 3.92 1065.0 0 \n",
+ "1 3.40 1050.0 0 \n",
+ "2 3.17 1185.0 0 \n",
+ "3 3.45 1480.0 0 \n",
+ "4 2.93 735.0 0 "
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "704ea79c",
+ "metadata": {},
+ "source": [
+ "Постройте матрицу корреляций для всех полей X. Дайте полученному датафрейму название X_corr."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "id": "41d5c34c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " alcohol | \n",
+ " malic_acid | \n",
+ " ash | \n",
+ " alcalinity_of_ash | \n",
+ " magnesium | \n",
+ " total_phenols | \n",
+ " flavanoids | \n",
+ " nonflavanoid_phenols | \n",
+ " proanthocyanins | \n",
+ " color_intensity | \n",
+ " hue | \n",
+ " od280/od315_of_diluted_wines | \n",
+ " proline | \n",
+ " target | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | alcohol | \n",
+ " 1.000000 | \n",
+ " 0.094397 | \n",
+ " 0.211545 | \n",
+ " -0.310235 | \n",
+ " 0.270798 | \n",
+ " 0.289101 | \n",
+ " 0.236815 | \n",
+ " -0.155929 | \n",
+ " 0.136698 | \n",
+ " 0.546364 | \n",
+ " -0.071747 | \n",
+ " 0.072343 | \n",
+ " 0.643720 | \n",
+ " -0.328222 | \n",
+ "
\n",
+ " \n",
+ " | malic_acid | \n",
+ " 0.094397 | \n",
+ " 1.000000 | \n",
+ " 0.164045 | \n",
+ " 0.288500 | \n",
+ " -0.054575 | \n",
+ " -0.335167 | \n",
+ " -0.411007 | \n",
+ " 0.292977 | \n",
+ " -0.220746 | \n",
+ " 0.248985 | \n",
+ " -0.561296 | \n",
+ " -0.368710 | \n",
+ " -0.192011 | \n",
+ " 0.437776 | \n",
+ "
\n",
+ " \n",
+ " | ash | \n",
+ " 0.211545 | \n",
+ " 0.164045 | \n",
+ " 1.000000 | \n",
+ " 0.443367 | \n",
+ " 0.286587 | \n",
+ " 0.128980 | \n",
+ " 0.115077 | \n",
+ " 0.186230 | \n",
+ " 0.009652 | \n",
+ " 0.258887 | \n",
+ " -0.074667 | \n",
+ " 0.003911 | \n",
+ " 0.223626 | \n",
+ " -0.049643 | \n",
+ "
\n",
+ " \n",
+ " | alcalinity_of_ash | \n",
+ " -0.310235 | \n",
+ " 0.288500 | \n",
+ " 0.443367 | \n",
+ " 1.000000 | \n",
+ " -0.083333 | \n",
+ " -0.321113 | \n",
+ " -0.351370 | \n",
+ " 0.361922 | \n",
+ " -0.197327 | \n",
+ " 0.018732 | \n",
+ " -0.273955 | \n",
+ " -0.276769 | \n",
+ " -0.440597 | \n",
+ " 0.517859 | \n",
+ "
\n",
+ " \n",
+ " | magnesium | \n",
+ " 0.270798 | \n",
+ " -0.054575 | \n",
+ " 0.286587 | \n",
+ " -0.083333 | \n",
+ " 1.000000 | \n",
+ " 0.214401 | \n",
+ " 0.195784 | \n",
+ " -0.256294 | \n",
+ " 0.236441 | \n",
+ " 0.199950 | \n",
+ " 0.055398 | \n",
+ " 0.066004 | \n",
+ " 0.393351 | \n",
+ " -0.209179 | \n",
+ "
\n",
+ " \n",
+ " | total_phenols | \n",
+ " 0.289101 | \n",
+ " -0.335167 | \n",
+ " 0.128980 | \n",
+ " -0.321113 | \n",
+ " 0.214401 | \n",
+ " 1.000000 | \n",
+ " 0.864564 | \n",
+ " -0.449935 | \n",
+ " 0.612413 | \n",
+ " -0.055136 | \n",
+ " 0.433681 | \n",
+ " 0.699949 | \n",
+ " 0.498115 | \n",
+ " -0.719163 | \n",
+ "
\n",
+ " \n",
+ " | flavanoids | \n",
+ " 0.236815 | \n",
+ " -0.411007 | \n",
+ " 0.115077 | \n",
+ " -0.351370 | \n",
+ " 0.195784 | \n",
+ " 0.864564 | \n",
+ " 1.000000 | \n",
+ " -0.537900 | \n",
+ " 0.652692 | \n",
+ " -0.172379 | \n",
+ " 0.543479 | \n",
+ " 0.787194 | \n",
+ " 0.494193 | \n",
+ " -0.847498 | \n",
+ "
\n",
+ " \n",
+ " | nonflavanoid_phenols | \n",
+ " -0.155929 | \n",
+ " 0.292977 | \n",
+ " 0.186230 | \n",
+ " 0.361922 | \n",
+ " -0.256294 | \n",
+ " -0.449935 | \n",
+ " -0.537900 | \n",
+ " 1.000000 | \n",
+ " -0.365845 | \n",
+ " 0.139057 | \n",
+ " -0.262640 | \n",
+ " -0.503270 | \n",
+ " -0.311385 | \n",
+ " 0.489109 | \n",
+ "
\n",
+ " \n",
+ " | proanthocyanins | \n",
+ " 0.136698 | \n",
+ " -0.220746 | \n",
+ " 0.009652 | \n",
+ " -0.197327 | \n",
+ " 0.236441 | \n",
+ " 0.612413 | \n",
+ " 0.652692 | \n",
+ " -0.365845 | \n",
+ " 1.000000 | \n",
+ " -0.025250 | \n",
+ " 0.295544 | \n",
+ " 0.519067 | \n",
+ " 0.330417 | \n",
+ " -0.499130 | \n",
+ "
\n",
+ " \n",
+ " | color_intensity | \n",
+ " 0.546364 | \n",
+ " 0.248985 | \n",
+ " 0.258887 | \n",
+ " 0.018732 | \n",
+ " 0.199950 | \n",
+ " -0.055136 | \n",
+ " -0.172379 | \n",
+ " 0.139057 | \n",
+ " -0.025250 | \n",
+ " 1.000000 | \n",
+ " -0.521813 | \n",
+ " -0.428815 | \n",
+ " 0.316100 | \n",
+ " 0.265668 | \n",
+ "
\n",
+ " \n",
+ " | hue | \n",
+ " -0.071747 | \n",
+ " -0.561296 | \n",
+ " -0.074667 | \n",
+ " -0.273955 | \n",
+ " 0.055398 | \n",
+ " 0.433681 | \n",
+ " 0.543479 | \n",
+ " -0.262640 | \n",
+ " 0.295544 | \n",
+ " -0.521813 | \n",
+ " 1.000000 | \n",
+ " 0.565468 | \n",
+ " 0.236183 | \n",
+ " -0.617369 | \n",
+ "
\n",
+ " \n",
+ " | od280/od315_of_diluted_wines | \n",
+ " 0.072343 | \n",
+ " -0.368710 | \n",
+ " 0.003911 | \n",
+ " -0.276769 | \n",
+ " 0.066004 | \n",
+ " 0.699949 | \n",
+ " 0.787194 | \n",
+ " -0.503270 | \n",
+ " 0.519067 | \n",
+ " -0.428815 | \n",
+ " 0.565468 | \n",
+ " 1.000000 | \n",
+ " 0.312761 | \n",
+ " -0.788230 | \n",
+ "
\n",
+ " \n",
+ " | proline | \n",
+ " 0.643720 | \n",
+ " -0.192011 | \n",
+ " 0.223626 | \n",
+ " -0.440597 | \n",
+ " 0.393351 | \n",
+ " 0.498115 | \n",
+ " 0.494193 | \n",
+ " -0.311385 | \n",
+ " 0.330417 | \n",
+ " 0.316100 | \n",
+ " 0.236183 | \n",
+ " 0.312761 | \n",
+ " 1.000000 | \n",
+ " -0.633717 | \n",
+ "
\n",
+ " \n",
+ " | target | \n",
+ " -0.328222 | \n",
+ " 0.437776 | \n",
+ " -0.049643 | \n",
+ " 0.517859 | \n",
+ " -0.209179 | \n",
+ " -0.719163 | \n",
+ " -0.847498 | \n",
+ " 0.489109 | \n",
+ " -0.499130 | \n",
+ " 0.265668 | \n",
+ " -0.617369 | \n",
+ " -0.788230 | \n",
+ " -0.633717 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " alcohol malic_acid ash \\\n",
+ "alcohol 1.000000 0.094397 0.211545 \n",
+ "malic_acid 0.094397 1.000000 0.164045 \n",
+ "ash 0.211545 0.164045 1.000000 \n",
+ "alcalinity_of_ash -0.310235 0.288500 0.443367 \n",
+ "magnesium 0.270798 -0.054575 0.286587 \n",
+ "total_phenols 0.289101 -0.335167 0.128980 \n",
+ "flavanoids 0.236815 -0.411007 0.115077 \n",
+ "nonflavanoid_phenols -0.155929 0.292977 0.186230 \n",
+ "proanthocyanins 0.136698 -0.220746 0.009652 \n",
+ "color_intensity 0.546364 0.248985 0.258887 \n",
+ "hue -0.071747 -0.561296 -0.074667 \n",
+ "od280/od315_of_diluted_wines 0.072343 -0.368710 0.003911 \n",
+ "proline 0.643720 -0.192011 0.223626 \n",
+ "target -0.328222 0.437776 -0.049643 \n",
+ "\n",
+ " alcalinity_of_ash magnesium total_phenols \\\n",
+ "alcohol -0.310235 0.270798 0.289101 \n",
+ "malic_acid 0.288500 -0.054575 -0.335167 \n",
+ "ash 0.443367 0.286587 0.128980 \n",
+ "alcalinity_of_ash 1.000000 -0.083333 -0.321113 \n",
+ "magnesium -0.083333 1.000000 0.214401 \n",
+ "total_phenols -0.321113 0.214401 1.000000 \n",
+ "flavanoids -0.351370 0.195784 0.864564 \n",
+ "nonflavanoid_phenols 0.361922 -0.256294 -0.449935 \n",
+ "proanthocyanins -0.197327 0.236441 0.612413 \n",
+ "color_intensity 0.018732 0.199950 -0.055136 \n",
+ "hue -0.273955 0.055398 0.433681 \n",
+ "od280/od315_of_diluted_wines -0.276769 0.066004 0.699949 \n",
+ "proline -0.440597 0.393351 0.498115 \n",
+ "target 0.517859 -0.209179 -0.719163 \n",
+ "\n",
+ " flavanoids nonflavanoid_phenols \\\n",
+ "alcohol 0.236815 -0.155929 \n",
+ "malic_acid -0.411007 0.292977 \n",
+ "ash 0.115077 0.186230 \n",
+ "alcalinity_of_ash -0.351370 0.361922 \n",
+ "magnesium 0.195784 -0.256294 \n",
+ "total_phenols 0.864564 -0.449935 \n",
+ "flavanoids 1.000000 -0.537900 \n",
+ "nonflavanoid_phenols -0.537900 1.000000 \n",
+ "proanthocyanins 0.652692 -0.365845 \n",
+ "color_intensity -0.172379 0.139057 \n",
+ "hue 0.543479 -0.262640 \n",
+ "od280/od315_of_diluted_wines 0.787194 -0.503270 \n",
+ "proline 0.494193 -0.311385 \n",
+ "target -0.847498 0.489109 \n",
+ "\n",
+ " proanthocyanins color_intensity hue \\\n",
+ "alcohol 0.136698 0.546364 -0.071747 \n",
+ "malic_acid -0.220746 0.248985 -0.561296 \n",
+ "ash 0.009652 0.258887 -0.074667 \n",
+ "alcalinity_of_ash -0.197327 0.018732 -0.273955 \n",
+ "magnesium 0.236441 0.199950 0.055398 \n",
+ "total_phenols 0.612413 -0.055136 0.433681 \n",
+ "flavanoids 0.652692 -0.172379 0.543479 \n",
+ "nonflavanoid_phenols -0.365845 0.139057 -0.262640 \n",
+ "proanthocyanins 1.000000 -0.025250 0.295544 \n",
+ "color_intensity -0.025250 1.000000 -0.521813 \n",
+ "hue 0.295544 -0.521813 1.000000 \n",
+ "od280/od315_of_diluted_wines 0.519067 -0.428815 0.565468 \n",
+ "proline 0.330417 0.316100 0.236183 \n",
+ "target -0.499130 0.265668 -0.617369 \n",
+ "\n",
+ " od280/od315_of_diluted_wines proline target \n",
+ "alcohol 0.072343 0.643720 -0.328222 \n",
+ "malic_acid -0.368710 -0.192011 0.437776 \n",
+ "ash 0.003911 0.223626 -0.049643 \n",
+ "alcalinity_of_ash -0.276769 -0.440597 0.517859 \n",
+ "magnesium 0.066004 0.393351 -0.209179 \n",
+ "total_phenols 0.699949 0.498115 -0.719163 \n",
+ "flavanoids 0.787194 0.494193 -0.847498 \n",
+ "nonflavanoid_phenols -0.503270 -0.311385 0.489109 \n",
+ "proanthocyanins 0.519067 0.330417 -0.499130 \n",
+ "color_intensity -0.428815 0.316100 0.265668 \n",
+ "hue 0.565468 0.236183 -0.617369 \n",
+ "od280/od315_of_diluted_wines 1.000000 0.312761 -0.788230 \n",
+ "proline 0.312761 1.000000 -0.633717 \n",
+ "target -0.788230 -0.633717 1.000000 "
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_corr = X.corr()\n",
+ "X_corr"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ed7c122",
+ "metadata": {},
+ "source": [
+ "Создайте список high_corr из признаков, корреляция которых с полем target по абсолютному значению превышает 0.5 (причем само поле target не должно входить в этот список)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "6edf6763",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['alcalinity_of_ash', 'total_phenols', 'flavanoids', 'hue',\n",
+ " 'od280/od315_of_diluted_wines', 'proline'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Features whose absolute correlation with target exceeds 0.5; target's own row is dropped first.\n",
+ "high_corr = (\n",
+ "    X_corr['target']\n",
+ "    .drop('target')\n",
+ "    .abs()\n",
+ "    .loc[lambda corr: corr > 0.5]\n",
+ "    .index\n",
+ ")\n",
+ "high_corr"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f0ff52e6",
+ "metadata": {},
+ "source": [
+ "Удалите из датафрейма X поле с целевой переменной. Для всех признаков, названия которых содержатся в списке high_corr, вычислите квадрат их значений и добавьте в датафрейм X соответствующие поля с суффиксом '_2', добавленным к первоначальному названию признака. Итоговый датафрейм должен содержать все поля, которые были в нем изначально, а также поля с признаками из списка high_corr, возведенными в квадрат. Выведите описание полей датафрейма X с помощью метода describe."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "id": "1e1403ec",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " alcohol | \n",
+ " malic_acid | \n",
+ " ash | \n",
+ " alcalinity_of_ash | \n",
+ " magnesium | \n",
+ " total_phenols | \n",
+ " flavanoids | \n",
+ " nonflavanoid_phenols | \n",
+ " proanthocyanins | \n",
+ " color_intensity | \n",
+ " hue | \n",
+ " od280/od315_of_diluted_wines | \n",
+ " proline | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 14.23 | \n",
+ " 1.71 | \n",
+ " 2.43 | \n",
+ " 15.6 | \n",
+ " 127.0 | \n",
+ " 2.80 | \n",
+ " 3.06 | \n",
+ " 0.28 | \n",
+ " 2.29 | \n",
+ " 5.64 | \n",
+ " 1.04 | \n",
+ " 3.92 | \n",
+ " 1065.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 13.20 | \n",
+ " 1.78 | \n",
+ " 2.14 | \n",
+ " 11.2 | \n",
+ " 100.0 | \n",
+ " 2.65 | \n",
+ " 2.76 | \n",
+ " 0.26 | \n",
+ " 1.28 | \n",
+ " 4.38 | \n",
+ " 1.05 | \n",
+ " 3.40 | \n",
+ " 1050.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 13.16 | \n",
+ " 2.36 | \n",
+ " 2.67 | \n",
+ " 18.6 | \n",
+ " 101.0 | \n",
+ " 2.80 | \n",
+ " 3.24 | \n",
+ " 0.30 | \n",
+ " 2.81 | \n",
+ " 5.68 | \n",
+ " 1.03 | \n",
+ " 3.17 | \n",
+ " 1185.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 14.37 | \n",
+ " 1.95 | \n",
+ " 2.50 | \n",
+ " 16.8 | \n",
+ " 113.0 | \n",
+ " 3.85 | \n",
+ " 3.49 | \n",
+ " 0.24 | \n",
+ " 2.18 | \n",
+ " 7.80 | \n",
+ " 0.86 | \n",
+ " 3.45 | \n",
+ " 1480.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 13.24 | \n",
+ " 2.59 | \n",
+ " 2.87 | \n",
+ " 21.0 | \n",
+ " 118.0 | \n",
+ " 2.80 | \n",
+ " 2.69 | \n",
+ " 0.39 | \n",
+ " 1.82 | \n",
+ " 4.32 | \n",
+ " 1.04 | \n",
+ " 2.93 | \n",
+ " 735.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
+ "0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
+ "1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
+ "2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
+ "3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
+ "4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
+ "\n",
+ " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
+ "0 3.06 0.28 2.29 5.64 1.04 \n",
+ "1 2.76 0.26 1.28 4.38 1.05 \n",
+ "2 3.24 0.30 2.81 5.68 1.03 \n",
+ "3 3.49 0.24 2.18 7.80 0.86 \n",
+ "4 2.69 0.39 1.82 4.32 1.04 \n",
+ "\n",
+ " od280/od315_of_diluted_wines proline \n",
+ "0 3.92 1065.0 \n",
+ "1 3.40 1050.0 \n",
+ "2 3.17 1185.0 \n",
+ "3 3.45 1480.0 \n",
+ "4 2.93 735.0 "
+ ]
+ },
+ "execution_count": 54,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# errors='ignore' keeps this cell re-runnable: X is rebound here, so a second run would otherwise raise KeyError.\n",
+ "X = X.drop(columns='target', errors='ignore')\n",
+ "X.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "74173e8d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " alcohol | \n",
+ " malic_acid | \n",
+ " ash | \n",
+ " alcalinity_of_ash | \n",
+ " magnesium | \n",
+ " total_phenols | \n",
+ " flavanoids | \n",
+ " nonflavanoid_phenols | \n",
+ " proanthocyanins | \n",
+ " color_intensity | \n",
+ " hue | \n",
+ " od280/od315_of_diluted_wines | \n",
+ " proline | \n",
+ " alcalinity_of_ash_2 | \n",
+ " total_phenols_2 | \n",
+ " flavanoids_2 | \n",
+ " hue_2 | \n",
+ " od280/od315_of_diluted_wines_2 | \n",
+ " proline_2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 14.23 | \n",
+ " 1.71 | \n",
+ " 2.43 | \n",
+ " 15.6 | \n",
+ " 127.0 | \n",
+ " 2.80 | \n",
+ " 3.06 | \n",
+ " 0.28 | \n",
+ " 2.29 | \n",
+ " 5.64 | \n",
+ " 1.04 | \n",
+ " 3.92 | \n",
+ " 1065.0 | \n",
+ " 243.36 | \n",
+ " 7.8400 | \n",
+ " 9.3636 | \n",
+ " 1.0816 | \n",
+ " 15.3664 | \n",
+ " 1134225.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 13.20 | \n",
+ " 1.78 | \n",
+ " 2.14 | \n",
+ " 11.2 | \n",
+ " 100.0 | \n",
+ " 2.65 | \n",
+ " 2.76 | \n",
+ " 0.26 | \n",
+ " 1.28 | \n",
+ " 4.38 | \n",
+ " 1.05 | \n",
+ " 3.40 | \n",
+ " 1050.0 | \n",
+ " 125.44 | \n",
+ " 7.0225 | \n",
+ " 7.6176 | \n",
+ " 1.1025 | \n",
+ " 11.5600 | \n",
+ " 1102500.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 13.16 | \n",
+ " 2.36 | \n",
+ " 2.67 | \n",
+ " 18.6 | \n",
+ " 101.0 | \n",
+ " 2.80 | \n",
+ " 3.24 | \n",
+ " 0.30 | \n",
+ " 2.81 | \n",
+ " 5.68 | \n",
+ " 1.03 | \n",
+ " 3.17 | \n",
+ " 1185.0 | \n",
+ " 345.96 | \n",
+ " 7.8400 | \n",
+ " 10.4976 | \n",
+ " 1.0609 | \n",
+ " 10.0489 | \n",
+ " 1404225.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 14.37 | \n",
+ " 1.95 | \n",
+ " 2.50 | \n",
+ " 16.8 | \n",
+ " 113.0 | \n",
+ " 3.85 | \n",
+ " 3.49 | \n",
+ " 0.24 | \n",
+ " 2.18 | \n",
+ " 7.80 | \n",
+ " 0.86 | \n",
+ " 3.45 | \n",
+ " 1480.0 | \n",
+ " 282.24 | \n",
+ " 14.8225 | \n",
+ " 12.1801 | \n",
+ " 0.7396 | \n",
+ " 11.9025 | \n",
+ " 2190400.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 13.24 | \n",
+ " 2.59 | \n",
+ " 2.87 | \n",
+ " 21.0 | \n",
+ " 118.0 | \n",
+ " 2.80 | \n",
+ " 2.69 | \n",
+ " 0.39 | \n",
+ " 1.82 | \n",
+ " 4.32 | \n",
+ " 1.04 | \n",
+ " 2.93 | \n",
+ " 735.0 | \n",
+ " 441.00 | \n",
+ " 7.8400 | \n",
+ " 7.2361 | \n",
+ " 1.0816 | \n",
+ " 8.5849 | \n",
+ " 540225.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
+ "0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
+ "1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
+ "2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
+ "3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
+ "4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
+ "\n",
+ " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
+ "0 3.06 0.28 2.29 5.64 1.04 \n",
+ "1 2.76 0.26 1.28 4.38 1.05 \n",
+ "2 3.24 0.30 2.81 5.68 1.03 \n",
+ "3 3.49 0.24 2.18 7.80 0.86 \n",
+ "4 2.69 0.39 1.82 4.32 1.04 \n",
+ "\n",
+ " od280/od315_of_diluted_wines proline alcalinity_of_ash_2 \\\n",
+ "0 3.92 1065.0 243.36 \n",
+ "1 3.40 1050.0 125.44 \n",
+ "2 3.17 1185.0 345.96 \n",
+ "3 3.45 1480.0 282.24 \n",
+ "4 2.93 735.0 441.00 \n",
+ "\n",
+ " total_phenols_2 flavanoids_2 hue_2 od280/od315_of_diluted_wines_2 \\\n",
+ "0 7.8400 9.3636 1.0816 15.3664 \n",
+ "1 7.0225 7.6176 1.1025 11.5600 \n",
+ "2 7.8400 10.4976 1.0609 10.0489 \n",
+ "3 14.8225 12.1801 0.7396 11.9025 \n",
+ "4 7.8400 7.2361 1.0816 8.5849 \n",
+ "\n",
+ " proline_2 \n",
+ "0 1134225.0 \n",
+ "1 1102500.0 \n",
+ "2 1404225.0 \n",
+ "3 2190400.0 \n",
+ "4 540225.0 "
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Append a squared copy of every high-correlation feature under the name '<feature>_2'.\n",
+ "for feature_name in high_corr:\n",
+ "    X[f'{feature_name}_2'] = X[feature_name].pow(2)\n",
+ "\n",
+ "X.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "id": "190f74c0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " alcohol | \n",
+ " malic_acid | \n",
+ " ash | \n",
+ " alcalinity_of_ash | \n",
+ " magnesium | \n",
+ " total_phenols | \n",
+ " flavanoids | \n",
+ " nonflavanoid_phenols | \n",
+ " proanthocyanins | \n",
+ " color_intensity | \n",
+ " hue | \n",
+ " od280/od315_of_diluted_wines | \n",
+ " proline | \n",
+ " alcalinity_of_ash_2 | \n",
+ " total_phenols_2 | \n",
+ " flavanoids_2 | \n",
+ " hue_2 | \n",
+ " od280/od315_of_diluted_wines_2 | \n",
+ " proline_2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | count | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 1.780000e+02 | \n",
+ "
\n",
+ " \n",
+ " | mean | \n",
+ " 13.000618 | \n",
+ " 2.336348 | \n",
+ " 2.366517 | \n",
+ " 19.494944 | \n",
+ " 99.741573 | \n",
+ " 2.295112 | \n",
+ " 2.029270 | \n",
+ " 0.361854 | \n",
+ " 1.590899 | \n",
+ " 5.058090 | \n",
+ " 0.957449 | \n",
+ " 2.611685 | \n",
+ " 746.893258 | \n",
+ " 391.142865 | \n",
+ " 5.657030 | \n",
+ " 5.110049 | \n",
+ " 0.968661 | \n",
+ " 7.322155 | \n",
+ " 6.564591e+05 | \n",
+ "
\n",
+ " \n",
+ " | std | \n",
+ " 0.811827 | \n",
+ " 1.117146 | \n",
+ " 0.274344 | \n",
+ " 3.339564 | \n",
+ " 14.282484 | \n",
+ " 0.625851 | \n",
+ " 0.998859 | \n",
+ " 0.124453 | \n",
+ " 0.572359 | \n",
+ " 2.318286 | \n",
+ " 0.228572 | \n",
+ " 0.709990 | \n",
+ " 314.907474 | \n",
+ " 133.671775 | \n",
+ " 2.936294 | \n",
+ " 4.211441 | \n",
+ " 0.443798 | \n",
+ " 3.584316 | \n",
+ " 5.558591e+05 | \n",
+ "
\n",
+ " \n",
+ " | min | \n",
+ " 11.030000 | \n",
+ " 0.740000 | \n",
+ " 1.360000 | \n",
+ " 10.600000 | \n",
+ " 70.000000 | \n",
+ " 0.980000 | \n",
+ " 0.340000 | \n",
+ " 0.130000 | \n",
+ " 0.410000 | \n",
+ " 1.280000 | \n",
+ " 0.480000 | \n",
+ " 1.270000 | \n",
+ " 278.000000 | \n",
+ " 112.360000 | \n",
+ " 0.960400 | \n",
+ " 0.115600 | \n",
+ " 0.230400 | \n",
+ " 1.612900 | \n",
+ " 7.728400e+04 | \n",
+ "
\n",
+ " \n",
+ " | 25% | \n",
+ " 12.362500 | \n",
+ " 1.602500 | \n",
+ " 2.210000 | \n",
+ " 17.200000 | \n",
+ " 88.000000 | \n",
+ " 1.742500 | \n",
+ " 1.205000 | \n",
+ " 0.270000 | \n",
+ " 1.250000 | \n",
+ " 3.220000 | \n",
+ " 0.782500 | \n",
+ " 1.937500 | \n",
+ " 500.500000 | \n",
+ " 295.840000 | \n",
+ " 3.036325 | \n",
+ " 1.452100 | \n",
+ " 0.612325 | \n",
+ " 3.754075 | \n",
+ " 2.505010e+05 | \n",
+ "
\n",
+ " \n",
+ " | 50% | \n",
+ " 13.050000 | \n",
+ " 1.865000 | \n",
+ " 2.360000 | \n",
+ " 19.500000 | \n",
+ " 98.000000 | \n",
+ " 2.355000 | \n",
+ " 2.135000 | \n",
+ " 0.340000 | \n",
+ " 1.555000 | \n",
+ " 4.690000 | \n",
+ " 0.965000 | \n",
+ " 2.780000 | \n",
+ " 673.500000 | \n",
+ " 380.250000 | \n",
+ " 5.546050 | \n",
+ " 4.558250 | \n",
+ " 0.931250 | \n",
+ " 7.728400 | \n",
+ " 4.536045e+05 | \n",
+ "
\n",
+ " \n",
+ " | 75% | \n",
+ " 13.677500 | \n",
+ " 3.082500 | \n",
+ " 2.557500 | \n",
+ " 21.500000 | \n",
+ " 107.000000 | \n",
+ " 2.800000 | \n",
+ " 2.875000 | \n",
+ " 0.437500 | \n",
+ " 1.950000 | \n",
+ " 6.200000 | \n",
+ " 1.120000 | \n",
+ " 3.170000 | \n",
+ " 985.000000 | \n",
+ " 462.250000 | \n",
+ " 7.840000 | \n",
+ " 8.265700 | \n",
+ " 1.254400 | \n",
+ " 10.048900 | \n",
+ " 9.702250e+05 | \n",
+ "
\n",
+ " \n",
+ " | max | \n",
+ " 14.830000 | \n",
+ " 5.800000 | \n",
+ " 3.230000 | \n",
+ " 30.000000 | \n",
+ " 162.000000 | \n",
+ " 3.880000 | \n",
+ " 5.080000 | \n",
+ " 0.660000 | \n",
+ " 3.580000 | \n",
+ " 13.000000 | \n",
+ " 1.710000 | \n",
+ " 4.000000 | \n",
+ " 1680.000000 | \n",
+ " 900.000000 | \n",
+ " 15.054400 | \n",
+ " 25.806400 | \n",
+ " 2.924100 | \n",
+ " 16.000000 | \n",
+ " 2.822400e+06 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n",
+ "count 178.000000 178.000000 178.000000 178.000000 178.000000 \n",
+ "mean 13.000618 2.336348 2.366517 19.494944 99.741573 \n",
+ "std 0.811827 1.117146 0.274344 3.339564 14.282484 \n",
+ "min 11.030000 0.740000 1.360000 10.600000 70.000000 \n",
+ "25% 12.362500 1.602500 2.210000 17.200000 88.000000 \n",
+ "50% 13.050000 1.865000 2.360000 19.500000 98.000000 \n",
+ "75% 13.677500 3.082500 2.557500 21.500000 107.000000 \n",
+ "max 14.830000 5.800000 3.230000 30.000000 162.000000 \n",
+ "\n",
+ " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n",
+ "count 178.000000 178.000000 178.000000 178.000000 \n",
+ "mean 2.295112 2.029270 0.361854 1.590899 \n",
+ "std 0.625851 0.998859 0.124453 0.572359 \n",
+ "min 0.980000 0.340000 0.130000 0.410000 \n",
+ "25% 1.742500 1.205000 0.270000 1.250000 \n",
+ "50% 2.355000 2.135000 0.340000 1.555000 \n",
+ "75% 2.800000 2.875000 0.437500 1.950000 \n",
+ "max 3.880000 5.080000 0.660000 3.580000 \n",
+ "\n",
+ " color_intensity hue od280/od315_of_diluted_wines proline \\\n",
+ "count 178.000000 178.000000 178.000000 178.000000 \n",
+ "mean 5.058090 0.957449 2.611685 746.893258 \n",
+ "std 2.318286 0.228572 0.709990 314.907474 \n",
+ "min 1.280000 0.480000 1.270000 278.000000 \n",
+ "25% 3.220000 0.782500 1.937500 500.500000 \n",
+ "50% 4.690000 0.965000 2.780000 673.500000 \n",
+ "75% 6.200000 1.120000 3.170000 985.000000 \n",
+ "max 13.000000 1.710000 4.000000 1680.000000 \n",
+ "\n",
+ " alcalinity_of_ash_2 total_phenols_2 flavanoids_2 hue_2 \\\n",
+ "count 178.000000 178.000000 178.000000 178.000000 \n",
+ "mean 391.142865 5.657030 5.110049 0.968661 \n",
+ "std 133.671775 2.936294 4.211441 0.443798 \n",
+ "min 112.360000 0.960400 0.115600 0.230400 \n",
+ "25% 295.840000 3.036325 1.452100 0.612325 \n",
+ "50% 380.250000 5.546050 4.558250 0.931250 \n",
+ "75% 462.250000 7.840000 8.265700 1.254400 \n",
+ "max 900.000000 15.054400 25.806400 2.924100 \n",
+ "\n",
+ " od280/od315_of_diluted_wines_2 proline_2 \n",
+ "count 178.000000 1.780000e+02 \n",
+ "mean 7.322155 6.564591e+05 \n",
+ "std 3.584316 5.558591e+05 \n",
+ "min 1.612900 7.728400e+04 \n",
+ "25% 3.754075 2.505010e+05 \n",
+ "50% 7.728400 4.536045e+05 \n",
+ "75% 10.048900 9.702250e+05 \n",
+ "max 16.000000 2.822400e+06 "
+ ]
+ },
+ "execution_count": 57,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "97f44af7",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}