diff --git a/dz_les_6.ipynb b/dz_les_6.ipynb new file mode 100644 index 0000000..1596a44 --- /dev/null +++ b/dz_les_6.ipynb @@ -0,0 +1,3982 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4175923c", + "metadata": {}, + "source": [ + "# Тема “Обучение с учителем”" + ] + }, + { + "cell_type": "markdown", + "id": "f0f29a5a", + "metadata": {}, + "source": [ + "## Задание 1\n", + "Импортируйте библиотеки pandas и numpy.\n", + "\n", + "Загрузите \"Boston House Prices dataset\" из встроенных наборов данных библиотеки sklearn. Создайте датафреймы X и y из этих данных.\n", + "\n", + "Разбейте эти датафреймы на тренировочные (X_train, y_train) и тестовые (X_test, y_test) с помощью функции train_test_split так, чтобы размер тестовой выборки составлял 30% от всех данных, при этом аргумент random_state должен быть равен 42.\n", + "\n", + "Создайте модель линейной регрессии под названием lr с помощью класса LinearRegression из модуля sklearn.linear_model.\n", + "\n", + "Обучите модель на тренировочных данных (используйте все признаки) и сделайте предсказание на тестовых.\n", + "\n", + "Вычислите R2 полученных предсказаний с помощью r2_score из модуля sklearn.metrics." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "144a1b12", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "4610fdd9", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_boston" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e5e02b98", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import warnings\n", + "from sklearn.datasets import load_boston\n", + "with warnings.catch_warnings():\n", + " # You should probably not use this dataset.\n", + " warnings.filterwarnings(\"ignore\")\n", + " boston = load_boston()\n", + "data = boston[\"data\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f011bce3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTAT
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.98
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.14
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.03
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.94
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.33
\n", + "
" + ], + "text/plain": [ + " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n", + "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n", + "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n", + "2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n", + "3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n", + "4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n", + "\n", + " PTRATIO B LSTAT \n", + "0 15.3 396.90 4.98 \n", + "1 17.8 396.90 9.14 \n", + "2 17.8 392.83 4.03 \n", + "3 18.7 394.63 2.94 \n", + "4 18.7 396.90 5.33 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_names = boston[\"feature_names\"]\n", + "\n", + "X = pd.DataFrame(data, columns=feature_names)\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "784f65ff", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
price
024.0
121.6
234.7
333.4
436.2
\n", + "
" + ], + "text/plain": [ + " price\n", + "0 24.0\n", + "1 21.6\n", + "2 34.7\n", + "3 33.4\n", + "4 36.2" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target = boston[\"target\"]\n", + "\n", + "Y = pd.DataFrame(target, columns=[\"price\"])\n", + "Y.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8fa75819", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9fd6b218", + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "354dab43", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9edf6a7d", + "metadata": {}, + "outputs": [], + "source": [ + "lr = LinearRegression()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c02104fc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lr.fit(X_train, Y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "70091549", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Y_testY_pred_lr
17323.628.648960
27432.436.495014
49113.615.411193
7222.825.403213
45216.118.855280
\n", + "
" + ], + "text/plain": [ + " Y_test Y_pred_lr\n", + "173 23.6 28.648960\n", + "274 32.4 36.495014\n", + "491 13.6 15.411193\n", + "72 22.8 25.403213\n", + "452 16.1 18.855280" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_pred_lr = lr.predict(X_test)\n", + "check_test_lr = pd.DataFrame({\n", + " \"Y_test\": Y_test[\"price\"], \n", + " \"Y_pred_lr\": y_pred_lr.flatten()})\n", + "\n", + "check_test_lr.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4aa02217", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "21.517444231176995\n" + ] + } + ], + "source": [ + "from sklearn.metrics import mean_squared_error\n", + "\n", + "mean_squared_error_lr = mean_squared_error(check_test_lr[\"Y_pred_lr\"], check_test_lr[\"Y_test\"])\n", + "print(mean_squared_error_lr)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "649f415f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.711226005748496" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.metrics import r2_score\n", + "\n", + "r2_score(Y_test, y_pred_lr)" + ] + }, + { + "cell_type": "markdown", + "id": "7055f50e", + "metadata": {}, + "source": [ + "## Задание 2\n", + "\n", + "Создайте модель под названием model с помощью RandomForestRegressor из модуля sklearn.ensemble.\n", + "\n", + "Сделайте аргумент n_estimators равным 1000, max_depth должен быть равен 12 и random_state сделайте равным 42.\n", + "\n", + "Обучите модель на тренировочных данных аналогично тому, как вы обучали модель LinearRegression,\n", + "но при этом в метод fit вместо датафрейма y_train поставьте y_train.values[:, 0],\n", + "чтобы получить из датафрейма одномерный массив Numpy,\n", + "так как для класса RandomForestRegressor в данном методе для аргумента y предпочтительно применение массивов 
вместо датафрейма.\n", + "\n", + "Сделайте предсказание на тестовых данных и посчитайте R2.\n", + "\n", + "Сравните с результатом из предыдущего задания. Напишите в комментариях к коду, какая модель в данном случае работает лучше." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c22ff6c5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestRegressor(max_depth=12, n_estimators=1000, random_state=42)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", + "model = RandomForestRegressor(n_estimators=1000, max_depth=12, random_state=42)\n", + "model.fit(X_train, Y_train.values[:, 0])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1b75bde2", + "metadata": {}, + "outputs": [], + "source": [ + "y_pred_1 = model.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "a2c36214", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.87472606157312" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r2_score(Y_test, y_pred_1)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "f763cd0f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
y_testy_pred_lry_pred_rf
17323.628.64896022.806412
27432.436.49501431.131464
49113.615.41119316.339125
7222.825.40321323.810726
45216.118.85528017.139521
7620.023.14668921.832284
31617.817.39212419.895747
14014.014.07859914.754118
47119.623.03692721.240835
50016.820.59943320.898658
\n", + "
" + ], + "text/plain": [ + " y_test y_pred_lr y_pred_rf\n", + "173 23.6 28.648960 22.806412\n", + "274 32.4 36.495014 31.131464\n", + "491 13.6 15.411193 16.339125\n", + "72 22.8 25.403213 23.810726\n", + "452 16.1 18.855280 17.139521\n", + "76 20.0 23.146689 21.832284\n", + "316 17.8 17.392124 19.895747\n", + "140 14.0 14.078599 14.754118\n", + "471 19.6 23.036927 21.240835\n", + "500 16.8 20.599433 20.898658" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "check_test = pd.DataFrame({\n", + " \"y_test\": Y_test[\"price\"],\n", + " \"y_pred_lr\": y_pred_lr.flatten(),\n", + " \"y_pred_rf\": y_pred_1.flatten(),\n", + "})\n", + "\n", + "check_test.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "e99bedf6", + "metadata": {}, + "source": [ + "R2 из первого задания (≈0.711) меньше, чем R2 во втором задании (≈0.875), а значит, у модели, построенной с помощью RandomForestRegressor, предсказания ближе к тестовым значениям." + ] + }, + { + "cell_type": "markdown", + "id": "5058d422", + "metadata": {}, + "source": [ + "## *Задание 3\n", + "Вызовите документацию для класса RandomForestRegressor, найдите информацию об атрибуте feature_importances_.\n", + "\n", + "С помощью этого атрибута найдите сумму всех показателей важности, установите, какие два признака показывают наибольшую важность." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "85115658", + "metadata": {}, + "outputs": [], + "source": [ + "?RandomForestRegressor" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "bdd498f9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.03167574 0.00154252 0.00713813 0.00123624 0.01426897 0.40268179\n", + " 0.01429864 0.06397257 0.00528122 0.01152493 0.01808108 0.01245085\n", + " 0.41584732]\n" + ] + } + ], + "source": [ + "print(model.feature_importances_)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "72405ef3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
feature_importancename
00.031676CRIM
10.001543ZN
20.007138INDUS
30.001236CHAS
40.014269NOX
50.402682RM
60.014299AGE
70.063973DIS
80.005281RAD
90.011525TAX
100.018081PTRATIO
110.012451B
120.415847LSTAT
\n", + "
" + ], + "text/plain": [ + " feature_importance name\n", + "0 0.031676 CRIM\n", + "1 0.001543 ZN\n", + "2 0.007138 INDUS\n", + "3 0.001236 CHAS\n", + "4 0.014269 NOX\n", + "5 0.402682 RM\n", + "6 0.014299 AGE\n", + "7 0.063973 DIS\n", + "8 0.005281 RAD\n", + "9 0.011525 TAX\n", + "10 0.018081 PTRATIO\n", + "11 0.012451 B\n", + "12 0.415847 LSTAT" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_importance = pd.DataFrame({'name':X.columns, \n", + " 'feature_importance':model.feature_importances_}, \n", + " columns=['feature_importance', 'name'])\n", + "feature_importance" + ] + }, + { + "cell_type": "markdown", + "id": "76f250dd", + "metadata": {}, + "source": [ + "Два признака, показывающие наибольшую важность:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "f94c95ec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
feature_importancename
120.415847LSTAT
50.402682RM
\n", + "
" + ], + "text/plain": [ + " feature_importance name\n", + "12 0.415847 LSTAT\n", + "5 0.402682 RM" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_importance.nlargest(2, 'feature_importance')" + ] + }, + { + "cell_type": "markdown", + "id": "26137a5b", + "metadata": {}, + "source": [ + "Сумма показателей важности:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "dcb2761a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.feature_importances_.sum()" + ] + }, + { + "cell_type": "markdown", + "id": "17b4ea43", + "metadata": {}, + "source": [ + "## *Задание 4\n", + "\n", + "В этом задании мы будем работать с датасетом, с которым мы уже знакомы по домашнему заданию по библиотеке Matplotlib, это датасет Credit Card Fraud Detection. Для этого датасета мы будем решать задачу классификации - будем определять, какие из транзакций по кредитной карте являются мошенническими. Данный датасет сильно несбалансирован (так как случаи мошенничества относительно редки), так что применение метрики accuracy не принесет пользы и не поможет выбрать лучшую модель. Мы будем вычислять AUC, то есть площадь под кривой ROC.\n", + "\n", + "Импортируйте из соответствующих модулей RandomForestClassifier, GridSearchCV и train_test_split.\n", + "\n", + "Загрузите датасет creditcard.csv и создайте датафрейм df.\n", + "\n", + "С помощью метода value_counts с аргументом normalize=True убедитесь в том, что выборка несбалансирована. Используя метод info, проверьте, все ли столбцы содержат числовые данные и нет ли в них пропусков. 
Примените следующую настройку, чтобы можно было просматривать все столбцы датафрейма: pd.options.display.max_columns = 100.\n", + "\n", + "Просмотрите первые 10 строк датафрейма df.\n", + "\n", + "Создайте датафрейм X из датафрейма df, исключив столбец Class.\n", + "\n", + "Создайте объект Series под названием y из столбца Class.\n", + "\n", + "Разбейте X и y на тренировочный и тестовый наборы данных при помощи функции train_test_split, используя аргументы: test_size=0.3, random_state=100, stratify=y. У вас должны получиться объекты X_train, X_test, y_train и y_test.\n", + "\n", + "Просмотрите информацию об их форме. Для поиска по сетке параметров задайте такие параметры: parameters = [{'n_estimators': [10, 15],'max_features': np.arange(3, 5),'max_depth': np.arange(4, 7)}]\n", + "\n", + "Создайте модель GridSearchCV со следующими аргументами: estimator=RandomForestClassifier(random_state=100), param_grid=parameters, scoring='roc_auc', cv=3.\n", + "\n", + "Обучите модель на тренировочном наборе данных (может занять несколько минут).\n", + "\n", + "Просмотрите параметры лучшей модели с помощью атрибута best_params_.\n", + "\n", + "Предскажите вероятности классов с помощью полученной модели и метода predict_proba.\n", + "\n", + "Из полученного результата (массив Numpy) выберите столбец с индексом 1 (вероятность класса 1) и запишите в массив y_pred_proba.\n", + "\n", + "Из модуля sklearn.metrics импортируйте метрику roc_auc_score.\n", + "\n", + "Вычислите AUC на тестовых данных и сравните с результатом, полученным на тренировочных данных, используя в качестве аргументов массивы y_test и y_pred_proba." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "3a5c8c3c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeV1V2V3V4V5V6V7V8V9...V21V22V23V24V25V26V27V28AmountClass
00.0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.363787...-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.021053149.620
10.01.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425...-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.0147242.690
21.0-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.514654...0.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.059752378.660
31.0-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024...-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.061458123.500
42.0-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.817739...-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.21515369.990
52.0-0.4259660.9605231.141109-0.1682520.420987-0.0297280.4762010.260314-0.568671...-0.208254-0.559825-0.026398-0.371427-0.2327940.1059150.2538440.0810803.670
64.01.2296580.1410040.0453711.2026130.1918810.272708-0.0051590.0812130.464960...-0.167716-0.270710-0.154104-0.7800550.750137-0.2572370.0345070.0051684.990
77.0-0.6442691.4179641.074380-0.4921990.9489340.4281181.120631-3.8078640.615375...1.943465-1.0154550.057504-0.649709-0.415267-0.051634-1.206921-1.08533940.800
87.0-0.8942860.286157-0.113192-0.2715262.6695993.7218180.3701450.851084-0.392048...-0.073425-0.268092-0.2042331.0115920.373205-0.3841570.0117470.14240493.200
99.0-0.3382621.1195931.044367-0.2221870.499361-0.2467610.6515830.069539-0.736727...-0.246914-0.633753-0.120794-0.385050-0.0697330.0941990.2462190.0830763.680
\n", + "

10 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " Time V1 V2 V3 V4 V5 V6 V7 \\\n", + "0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n", + "1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n", + "2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n", + "3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n", + "4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n", + "5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 \n", + "6 4.0 1.229658 0.141004 0.045371 1.202613 0.191881 0.272708 -0.005159 \n", + "7 7.0 -0.644269 1.417964 1.074380 -0.492199 0.948934 0.428118 1.120631 \n", + "8 7.0 -0.894286 0.286157 -0.113192 -0.271526 2.669599 3.721818 0.370145 \n", + "9 9.0 -0.338262 1.119593 1.044367 -0.222187 0.499361 -0.246761 0.651583 \n", + "\n", + " V8 V9 ... V21 V22 V23 V24 V25 \\\n", + "0 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 \n", + "1 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 \n", + "2 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 \n", + "3 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 \n", + "4 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 \n", + "5 0.260314 -0.568671 ... -0.208254 -0.559825 -0.026398 -0.371427 -0.232794 \n", + "6 0.081213 0.464960 ... -0.167716 -0.270710 -0.154104 -0.780055 0.750137 \n", + "7 -3.807864 0.615375 ... 1.943465 -1.015455 0.057504 -0.649709 -0.415267 \n", + "8 0.851084 -0.392048 ... -0.073425 -0.268092 -0.204233 1.011592 0.373205 \n", + "9 0.069539 -0.736727 ... 
-0.246914 -0.633753 -0.120794 -0.385050 -0.069733 \n", + "\n", + " V26 V27 V28 Amount Class \n", + "0 -0.189115 0.133558 -0.021053 149.62 0 \n", + "1 0.125895 -0.008983 0.014724 2.69 0 \n", + "2 -0.139097 -0.055353 -0.059752 378.66 0 \n", + "3 -0.221929 0.062723 0.061458 123.50 0 \n", + "4 0.502292 0.219422 0.215153 69.99 0 \n", + "5 0.105915 0.253844 0.081080 3.67 0 \n", + "6 -0.257237 0.034507 0.005168 4.99 0 \n", + "7 -0.051634 -1.206921 -1.085339 40.80 0 \n", + "8 -0.384157 0.011747 0.142404 93.20 0 \n", + "9 0.094199 0.246219 0.083076 3.68 0 \n", + "\n", + "[10 rows x 31 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "df = pd.read_csv('creditcard.csv')\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "90ca1499", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 0.998273\n", + "1 0.001727\n", + "Name: Class, dtype: float64" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Class'].value_counts(normalize=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "a95f6657", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 284807 entries, 0 to 284806\n", + "Data columns (total 31 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Time 284807 non-null float64\n", + " 1 V1 284807 non-null float64\n", + " 2 V2 284807 non-null float64\n", + " 3 V3 284807 non-null float64\n", + " 4 V4 284807 non-null float64\n", + " 5 V5 284807 non-null float64\n", + " 6 V6 284807 non-null float64\n", + " 7 V7 284807 non-null float64\n", + " 8 V8 284807 non-null float64\n", 
+ " 9 V9 284807 non-null float64\n", + " 10 V10 284807 non-null float64\n", + " 11 V11 284807 non-null float64\n", + " 12 V12 284807 non-null float64\n", + " 13 V13 284807 non-null float64\n", + " 14 V14 284807 non-null float64\n", + " 15 V15 284807 non-null float64\n", + " 16 V16 284807 non-null float64\n", + " 17 V17 284807 non-null float64\n", + " 18 V18 284807 non-null float64\n", + " 19 V19 284807 non-null float64\n", + " 20 V20 284807 non-null float64\n", + " 21 V21 284807 non-null float64\n", + " 22 V22 284807 non-null float64\n", + " 23 V23 284807 non-null float64\n", + " 24 V24 284807 non-null float64\n", + " 25 V25 284807 non-null float64\n", + " 26 V26 284807 non-null float64\n", + " 27 V27 284807 non-null float64\n", + " 28 V28 284807 non-null float64\n", + " 29 Amount 284807 non-null float64\n", + " 30 Class 284807 non-null int64 \n", + "dtypes: float64(30), int64(1)\n", + "memory usage: 67.4 MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "f05fbe3e", + "metadata": {}, + "outputs": [], + "source": [ + "pd.options.display.max_columns=100" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "06bc4272", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeV1V2V3V4V5V6V7V8V9V10V11V12V13V14V15V16V17V18V19V20V21V22V23V24V25V26V27V28AmountClass
00.0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.3637870.090794-0.551600-0.617801-0.991390-0.3111691.468177-0.4704010.2079710.0257910.4039930.251412-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.021053149.620
10.01.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425-0.1669741.6127271.0652350.489095-0.1437720.6355580.463917-0.114805-0.183361-0.145783-0.069083-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.0147242.690
21.0-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.5146540.2076430.6245010.0660840.717293-0.1659462.345865-2.8900831.109969-0.121359-2.2618570.5249800.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.059752378.660
31.0-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024-0.054952-0.2264870.1782280.507757-0.287924-0.631418-1.059647-0.6840931.965775-1.232622-0.208038-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.061458123.500
42.0-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.8177390.753074-0.8228430.5381961.345852-1.1196700.175121-0.451449-0.237033-0.0381950.8034870.408542-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.21515369.990
52.0-0.4259660.9605231.141109-0.1682520.420987-0.0297280.4762010.260314-0.568671-0.3714071.3412620.359894-0.358091-0.1371340.5176170.401726-0.0581330.068653-0.0331940.084968-0.208254-0.559825-0.026398-0.371427-0.2327940.1059150.2538440.0810803.670
64.01.2296580.1410040.0453711.2026130.1918810.272708-0.0051590.0812130.464960-0.099254-1.416907-0.153826-0.7510630.1673720.050144-0.4435870.002821-0.611987-0.045575-0.219633-0.167716-0.270710-0.154104-0.7800550.750137-0.2572370.0345070.0051684.990
77.0-0.6442691.4179641.074380-0.4921990.9489340.4281181.120631-3.8078640.6153751.249376-0.6194680.2914741.757964-1.3238650.686133-0.076127-1.222127-0.3582220.324505-0.1567421.943465-1.0154550.057504-0.649709-0.415267-0.051634-1.206921-1.08533940.800
87.0-0.8942860.286157-0.113192-0.2715262.6695993.7218180.3701450.851084-0.392048-0.410430-0.705117-0.110452-0.2862540.074355-0.328783-0.210077-0.4997680.1187650.5703280.052736-0.073425-0.268092-0.2042331.0115920.373205-0.3841570.0117470.14240493.200
99.0-0.3382621.1195931.044367-0.2221870.499361-0.2467610.6515830.069539-0.736727-0.3668461.0176140.8363901.006844-0.4435230.1502190.739453-0.5409800.4766770.4517730.203711-0.246914-0.633753-0.120794-0.385050-0.0697330.0941990.2462190.0830763.680
\n", + "
" + ], + "text/plain": [ + " Time V1 V2 V3 V4 V5 V6 V7 \\\n", + "0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n", + "1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n", + "2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n", + "3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n", + "4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n", + "5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 \n", + "6 4.0 1.229658 0.141004 0.045371 1.202613 0.191881 0.272708 -0.005159 \n", + "7 7.0 -0.644269 1.417964 1.074380 -0.492199 0.948934 0.428118 1.120631 \n", + "8 7.0 -0.894286 0.286157 -0.113192 -0.271526 2.669599 3.721818 0.370145 \n", + "9 9.0 -0.338262 1.119593 1.044367 -0.222187 0.499361 -0.246761 0.651583 \n", + "\n", + " V8 V9 V10 V11 V12 V13 V14 \\\n", + "0 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 \n", + "1 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 \n", + "2 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 \n", + "3 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 \n", + "4 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 \n", + "5 0.260314 -0.568671 -0.371407 1.341262 0.359894 -0.358091 -0.137134 \n", + "6 0.081213 0.464960 -0.099254 -1.416907 -0.153826 -0.751063 0.167372 \n", + "7 -3.807864 0.615375 1.249376 -0.619468 0.291474 1.757964 -1.323865 \n", + "8 0.851084 -0.392048 -0.410430 -0.705117 -0.110452 -0.286254 0.074355 \n", + "9 0.069539 -0.736727 -0.366846 1.017614 0.836390 1.006844 -0.443523 \n", + "\n", + " V15 V16 V17 V18 V19 V20 V21 \\\n", + "0 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 \n", + "1 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 \n", + "2 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 \n", + "3 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 
-0.208038 -0.108300 \n", + "4 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 \n", + "5 0.517617 0.401726 -0.058133 0.068653 -0.033194 0.084968 -0.208254 \n", + "6 0.050144 -0.443587 0.002821 -0.611987 -0.045575 -0.219633 -0.167716 \n", + "7 0.686133 -0.076127 -1.222127 -0.358222 0.324505 -0.156742 1.943465 \n", + "8 -0.328783 -0.210077 -0.499768 0.118765 0.570328 0.052736 -0.073425 \n", + "9 0.150219 0.739453 -0.540980 0.476677 0.451773 0.203711 -0.246914 \n", + "\n", + " V22 V23 V24 V25 V26 V27 V28 \\\n", + "0 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 \n", + "1 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 \n", + "2 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 \n", + "3 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 \n", + "4 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 \n", + "5 -0.559825 -0.026398 -0.371427 -0.232794 0.105915 0.253844 0.081080 \n", + "6 -0.270710 -0.154104 -0.780055 0.750137 -0.257237 0.034507 0.005168 \n", + "7 -1.015455 0.057504 -0.649709 -0.415267 -0.051634 -1.206921 -1.085339 \n", + "8 -0.268092 -0.204233 1.011592 0.373205 -0.384157 0.011747 0.142404 \n", + "9 -0.633753 -0.120794 -0.385050 -0.069733 0.094199 0.246219 0.083076 \n", + "\n", + " Amount Class \n", + "0 149.62 0 \n", + "1 2.69 0 \n", + "2 378.66 0 \n", + "3 123.50 0 \n", + "4 69.99 0 \n", + "5 3.67 0 \n", + "6 4.99 0 \n", + "7 40.80 0 \n", + "8 93.20 0 \n", + "9 3.68 0 " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "334f4a8f", + "metadata": {}, + "outputs": [], + "source": [ + "X = df.drop(\"Class\", axis=1)\n", + "y = df[\"Class\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "e13da9aa", + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = 
train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "daed9819", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "X_train (199364, 30)\n", + "X_test (85443, 30)\n", + "y_train (199364,)\n", + "y_test (85443,)\n" + ] + } + ], + "source": [ + "print('X_train ', X_train.shape)\n", + "print('X_test ', X_test.shape)\n", + "print('y_train ', y_train.shape)\n", + "print('y_test ', y_test.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "d359efaa", + "metadata": {}, + "outputs": [], + "source": [ + "parameters = [{'n_estimators': [10, 15],'max_features': np.arange(3, 5),'max_depth': np.arange(4, 7)}]" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "fae18bbe", + "metadata": {}, + "outputs": [], + "source": [ + "clf = GridSearchCV(\n", + " estimator=RandomForestClassifier(random_state=100),\n", + " param_grid=parameters,\n", + " scoring='roc_auc',\n", + " cv=3,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "424b763d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=100),\n", + " param_grid=[{'max_depth': array([4, 5, 6]),\n", + " 'max_features': array([3, 4]),\n", + " 'n_estimators': [10, 15]}],\n", + " scoring='roc_auc')" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "e2454620", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'max_depth': 6, 'max_features': 3, 'n_estimators': 15}" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.best_params_" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "ed7b45ed", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestClassifier(max_depth=6, max_features=3, n_estimators=15)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf = RandomForestClassifier(max_depth=6, max_features=3, n_estimators=15, random_state=100)  # best_params_ from the search, refit with the same seed as the searched estimator\n", + "\n", + "clf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "6bafbb40", + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = clf.predict_proba(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "1b1439c3", + "metadata": {}, + "outputs": [], + "source": [ + "y_pred_proba = y_pred[:, 1]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "0dda4ca7", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import roc_auc_score" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "46103f29", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9476239854368701" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roc_auc_score(y_test, y_pred_proba)" + ] + }, + { + "cell_type": "markdown", + "id": "1f00ab31", + "metadata": {}, + "source": [ + "# *Дополнительные задания:" + ] + }, + { + "cell_type": "markdown", + "id": "d82b00e3", + "metadata": {}, + "source": [ + "Загрузите датасет Wine из встроенных датасетов sklearn.datasets с помощью функции load_wine в переменную data." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "f5589ed0", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_wine\n", + "data = load_wine()" + ] + }, + { + "cell_type": "markdown", + "id": "e976c96d", + "metadata": {}, + "source": [ + "Полученный датасет не является датафреймом. Это структура данных, имеющая ключи аналогично словарю. 
Просмотрите тип данных этой структуры данных и создайте список data_keys, содержащий ее ключи." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "1abc94de", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "\n", + "dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])\n" + ] + } + ], + "source": [ + "print(type(data), '\\n')\n", + "data_keys = list(data.keys())  # the task asks for a list, not a dict_keys view\n", + "print(data_keys)" + ] + }, + { + "cell_type": "markdown", + "id": "b99920b6", + "metadata": {}, + "source": [ + "Просмотрите данные, описание и названия признаков в датасете. Описание нужно вывести в виде привычного, аккуратно оформленного текста, без обозначений переноса строки, но с самими переносами и т.д." + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "2d21f5c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,\n", + " 1.065e+03],\n", + " [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,\n", + " 1.050e+03],\n", + " [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,\n", + " 1.185e+03],\n", + " ...,\n", + " [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,\n", + " 8.350e+02],\n", + " [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,\n", + " 8.400e+02],\n", + " [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,\n", + " 5.600e+02]])" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.data" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "17d00b8c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".. 
_wine_dataset:\n", + "\n", + "Wine recognition dataset\n", + "------------------------\n", + "\n", + "**Data Set Characteristics:**\n", + "\n", + " :Number of Instances: 178 (50 in each of three classes)\n", + " :Number of Attributes: 13 numeric, predictive attributes and the class\n", + " :Attribute Information:\n", + " \t\t- Alcohol\n", + " \t\t- Malic acid\n", + " \t\t- Ash\n", + "\t\t- Alcalinity of ash \n", + " \t\t- Magnesium\n", + "\t\t- Total phenols\n", + " \t\t- Flavanoids\n", + " \t\t- Nonflavanoid phenols\n", + " \t\t- Proanthocyanins\n", + "\t\t- Color intensity\n", + " \t\t- Hue\n", + " \t\t- OD280/OD315 of diluted wines\n", + " \t\t- Proline\n", + "\n", + " - class:\n", + " - class_0\n", + " - class_1\n", + " - class_2\n", + "\t\t\n", + " :Summary Statistics:\n", + " \n", + " ============================= ==== ===== ======= =====\n", + " Min Max Mean SD\n", + " ============================= ==== ===== ======= =====\n", + " Alcohol: 11.0 14.8 13.0 0.8\n", + " Malic Acid: 0.74 5.80 2.34 1.12\n", + " Ash: 1.36 3.23 2.36 0.27\n", + " Alcalinity of Ash: 10.6 30.0 19.5 3.3\n", + " Magnesium: 70.0 162.0 99.7 14.3\n", + " Total Phenols: 0.98 3.88 2.29 0.63\n", + " Flavanoids: 0.34 5.08 2.03 1.00\n", + " Nonflavanoid Phenols: 0.13 0.66 0.36 0.12\n", + " Proanthocyanins: 0.41 3.58 1.59 0.57\n", + " Colour Intensity: 1.3 13.0 5.1 2.3\n", + " Hue: 0.48 1.71 0.96 0.23\n", + " OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.71\n", + " Proline: 278 1680 746 315\n", + " ============================= ==== ===== ======= =====\n", + "\n", + " :Missing Attribute Values: None\n", + " :Class Distribution: class_0 (59), class_1 (71), class_2 (48)\n", + " :Creator: R.A. 
Fisher\n", + " :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n", + " :Date: July, 1988\n", + "\n", + "This is a copy of UCI ML Wine recognition datasets.\n", + "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data\n", + "\n", + "The data is the results of a chemical analysis of wines grown in the same\n", + "region in Italy by three different cultivators. There are thirteen different\n", + "measurements taken for different constituents found in the three types of\n", + "wine.\n", + "\n", + "Original Owners: \n", + "\n", + "Forina, M. et al, PARVUS - \n", + "An Extendible Package for Data Exploration, Classification and Correlation. \n", + "Institute of Pharmaceutical and Food Analysis and Technologies,\n", + "Via Brigata Salerno, 16147 Genoa, Italy.\n", + "\n", + "Citation:\n", + "\n", + "Lichman, M. (2013). UCI Machine Learning Repository\n", + "[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,\n", + "School of Information and Computer Science. \n", + "\n", + ".. topic:: References\n", + "\n", + " (1) S. Aeberhard, D. Coomans and O. de Vel, \n", + " Comparison of Classifiers in High Dimensional Settings, \n", + " Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of \n", + " Mathematics and Statistics, James Cook University of North Queensland. \n", + " (Also submitted to Technometrics). \n", + "\n", + " The data was used with many others for comparing various \n", + " classifiers. The classes are separable, though only RDA \n", + " has achieved 100% correct classification. \n", + " (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) \n", + " (All results using the leave-one-out technique) \n", + "\n", + " (2) S. Aeberhard, D. Coomans and O. de Vel, \n", + " \"THE CLASSIFICATION PERFORMANCE OF RDA\" \n", + " Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of \n", + " Mathematics and Statistics, James Cook University of North Queensland. 
\n", + " (Also submitted to Journal of Chemometrics).\n", + "\n" + ] + } + ], + "source": [ + "print(data.DESCR)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "cc71da75", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['alcohol',\n", + " 'malic_acid',\n", + " 'ash',\n", + " 'alcalinity_of_ash',\n", + " 'magnesium',\n", + " 'total_phenols',\n", + " 'flavanoids',\n", + " 'nonflavanoid_phenols',\n", + " 'proanthocyanins',\n", + " 'color_intensity',\n", + " 'hue',\n", + " 'od280/od315_of_diluted_wines',\n", + " 'proline']" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.feature_names" + ] + }, + { + "cell_type": "markdown", + "id": "06759300", + "metadata": {}, + "source": [ + "Сколько классов содержит целевая переменная датасета? Выведите названия классов." + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "04aebed7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Количество классов: (3,) \n", + "\n", + "Названия классов: ['class_0' 'class_1' 'class_2']\n" + ] + } + ], + "source": [ + "print('Количество классов: ', np.unique(data[\"target\"]).size, '\\n')  # .size is the class count; .shape printed the tuple (3,)\n", + "print('Названия классов: ',data[\"target_names\"])" + ] + }, + { + "cell_type": "markdown", + "id": "8453a644", + "metadata": {}, + "source": [ + "На основе данных датасета (они содержатся в двумерном массиве Numpy) и названий признаков создайте датафрейм под названием X." + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "32a20538", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesproline
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.0
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.0
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.0
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.0
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.0
\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + "\n", + " od280/od315_of_diluted_wines proline \n", + "0 3.92 1065.0 \n", + "1 3.40 1050.0 \n", + "2 3.17 1185.0 \n", + "3 3.45 1480.0 \n", + "4 2.93 735.0 " + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = pd.DataFrame(data.data, columns=data.feature_names)\n", + "X.head()" + ] + }, + { + "cell_type": "markdown", + "id": "048fd139", + "metadata": {}, + "source": [ + "Выясните размер датафрейма X и установите, имеются ли в нем пропущенные значения." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "5d4936b9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(178, 13)" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "25e73807", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 178 entries, 0 to 177\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 alcohol 178 non-null float64\n", + " 1 malic_acid 178 non-null float64\n", + " 2 ash 178 non-null float64\n", + " 3 alcalinity_of_ash 178 non-null float64\n", + " 4 magnesium 178 non-null float64\n", + " 5 total_phenols 178 non-null float64\n", + " 6 flavanoids 178 non-null float64\n", + " 7 nonflavanoid_phenols 178 non-null float64\n", + " 8 proanthocyanins 178 non-null float64\n", + " 9 color_intensity 178 non-null float64\n", + " 10 hue 178 non-null float64\n", + " 11 od280/od315_of_diluted_wines 178 non-null float64\n", + " 12 proline 178 non-null float64\n", + "dtypes: float64(13)\n", + "memory usage: 18.2 KB\n" + ] + } + ], + "source": [ + "X.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "e4d7b962", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "alcohol 0\n", + "malic_acid 0\n", + "ash 0\n", + "alcalinity_of_ash 0\n", + "magnesium 0\n", + "total_phenols 0\n", + "flavanoids 0\n", + "nonflavanoid_phenols 0\n", + "proanthocyanins 0\n", + "color_intensity 0\n", + "hue 0\n", + "od280/od315_of_diluted_wines 0\n", + "proline 0\n", + "dtype: int64" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.isnull().astype(\"int\").sum()" + ] + }, + { + "cell_type": "markdown", + "id": "c50ece7a", + "metadata": {}, + "source": [ + 
"Добавьте в датафрейм поле с классами вин в виде чисел, имеющих тип данных numpy.int64. Название поля - 'target'." + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "7a8e7cfa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 178 entries, 0 to 177\n", + "Data columns (total 14 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 alcohol 178 non-null float64\n", + " 1 malic_acid 178 non-null float64\n", + " 2 ash 178 non-null float64\n", + " 3 alcalinity_of_ash 178 non-null float64\n", + " 4 magnesium 178 non-null float64\n", + " 5 total_phenols 178 non-null float64\n", + " 6 flavanoids 178 non-null float64\n", + " 7 nonflavanoid_phenols 178 non-null float64\n", + " 8 proanthocyanins 178 non-null float64\n", + " 9 color_intensity 178 non-null float64\n", + " 10 hue 178 non-null float64\n", + " 11 od280/od315_of_diluted_wines 178 non-null float64\n", + " 12 proline 178 non-null float64\n", + " 13 target 178 non-null int64 \n", + "dtypes: float64(13), int64(1)\n", + "memory usage: 19.6 KB\n" + ] + } + ], + "source": [ + "X[\"target\"]=data[\"target\"].astype(np.int64)\n", + "X.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "52293807", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolinetarget
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + "\n", + " od280/od315_of_diluted_wines proline target \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 " + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.head()" + ] + }, + { + "cell_type": "markdown", + "id": "e1dfa403", + "metadata": {}, + "source": [ + "Постройте матрицу корреляций для всех полей X. Дайте полученному датафрейму название X_corr." + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "3e105873", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolinetarget
alcohol1.0000000.0943970.211545-0.3102350.2707980.2891010.236815-0.1559290.1366980.546364-0.0717470.0723430.643720-0.328222
malic_acid0.0943971.0000000.1640450.288500-0.054575-0.335167-0.4110070.292977-0.2207460.248985-0.561296-0.368710-0.1920110.437776
ash0.2115450.1640451.0000000.4433670.2865870.1289800.1150770.1862300.0096520.258887-0.0746670.0039110.223626-0.049643
alcalinity_of_ash-0.3102350.2885000.4433671.000000-0.083333-0.321113-0.3513700.361922-0.1973270.018732-0.273955-0.276769-0.4405970.517859
magnesium0.270798-0.0545750.286587-0.0833331.0000000.2144010.195784-0.2562940.2364410.1999500.0553980.0660040.393351-0.209179
total_phenols0.289101-0.3351670.128980-0.3211130.2144011.0000000.864564-0.4499350.612413-0.0551360.4336810.6999490.498115-0.719163
flavanoids0.236815-0.4110070.115077-0.3513700.1957840.8645641.000000-0.5379000.652692-0.1723790.5434790.7871940.494193-0.847498
nonflavanoid_phenols-0.1559290.2929770.1862300.361922-0.256294-0.449935-0.5379001.000000-0.3658450.139057-0.262640-0.503270-0.3113850.489109
proanthocyanins0.136698-0.2207460.009652-0.1973270.2364410.6124130.652692-0.3658451.000000-0.0252500.2955440.5190670.330417-0.499130
color_intensity0.5463640.2489850.2588870.0187320.199950-0.055136-0.1723790.139057-0.0252501.000000-0.521813-0.4288150.3161000.265668
hue-0.071747-0.561296-0.074667-0.2739550.0553980.4336810.543479-0.2626400.295544-0.5218131.0000000.5654680.236183-0.617369
od280/od315_of_diluted_wines0.072343-0.3687100.003911-0.2767690.0660040.6999490.787194-0.5032700.519067-0.4288150.5654681.0000000.312761-0.788230
proline0.643720-0.1920110.223626-0.4405970.3933510.4981150.494193-0.3113850.3304170.3161000.2361830.3127611.000000-0.633717
target-0.3282220.437776-0.0496430.517859-0.209179-0.719163-0.8474980.489109-0.4991300.265668-0.617369-0.788230-0.6337171.000000
\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash \\\n", + "alcohol 1.000000 0.094397 0.211545 \n", + "malic_acid 0.094397 1.000000 0.164045 \n", + "ash 0.211545 0.164045 1.000000 \n", + "alcalinity_of_ash -0.310235 0.288500 0.443367 \n", + "magnesium 0.270798 -0.054575 0.286587 \n", + "total_phenols 0.289101 -0.335167 0.128980 \n", + "flavanoids 0.236815 -0.411007 0.115077 \n", + "nonflavanoid_phenols -0.155929 0.292977 0.186230 \n", + "proanthocyanins 0.136698 -0.220746 0.009652 \n", + "color_intensity 0.546364 0.248985 0.258887 \n", + "hue -0.071747 -0.561296 -0.074667 \n", + "od280/od315_of_diluted_wines 0.072343 -0.368710 0.003911 \n", + "proline 0.643720 -0.192011 0.223626 \n", + "target -0.328222 0.437776 -0.049643 \n", + "\n", + " alcalinity_of_ash magnesium total_phenols \\\n", + "alcohol -0.310235 0.270798 0.289101 \n", + "malic_acid 0.288500 -0.054575 -0.335167 \n", + "ash 0.443367 0.286587 0.128980 \n", + "alcalinity_of_ash 1.000000 -0.083333 -0.321113 \n", + "magnesium -0.083333 1.000000 0.214401 \n", + "total_phenols -0.321113 0.214401 1.000000 \n", + "flavanoids -0.351370 0.195784 0.864564 \n", + "nonflavanoid_phenols 0.361922 -0.256294 -0.449935 \n", + "proanthocyanins -0.197327 0.236441 0.612413 \n", + "color_intensity 0.018732 0.199950 -0.055136 \n", + "hue -0.273955 0.055398 0.433681 \n", + "od280/od315_of_diluted_wines -0.276769 0.066004 0.699949 \n", + "proline -0.440597 0.393351 0.498115 \n", + "target 0.517859 -0.209179 -0.719163 \n", + "\n", + " flavanoids nonflavanoid_phenols \\\n", + "alcohol 0.236815 -0.155929 \n", + "malic_acid -0.411007 0.292977 \n", + "ash 0.115077 0.186230 \n", + "alcalinity_of_ash -0.351370 0.361922 \n", + "magnesium 0.195784 -0.256294 \n", + "total_phenols 0.864564 -0.449935 \n", + "flavanoids 1.000000 -0.537900 \n", + "nonflavanoid_phenols -0.537900 1.000000 \n", + "proanthocyanins 0.652692 -0.365845 \n", + "color_intensity -0.172379 0.139057 \n", + "hue 0.543479 -0.262640 \n", + "od280/od315_of_diluted_wines 
0.787194 -0.503270 \n", + "proline 0.494193 -0.311385 \n", + "target -0.847498 0.489109 \n", + "\n", + " proanthocyanins color_intensity hue \\\n", + "alcohol 0.136698 0.546364 -0.071747 \n", + "malic_acid -0.220746 0.248985 -0.561296 \n", + "ash 0.009652 0.258887 -0.074667 \n", + "alcalinity_of_ash -0.197327 0.018732 -0.273955 \n", + "magnesium 0.236441 0.199950 0.055398 \n", + "total_phenols 0.612413 -0.055136 0.433681 \n", + "flavanoids 0.652692 -0.172379 0.543479 \n", + "nonflavanoid_phenols -0.365845 0.139057 -0.262640 \n", + "proanthocyanins 1.000000 -0.025250 0.295544 \n", + "color_intensity -0.025250 1.000000 -0.521813 \n", + "hue 0.295544 -0.521813 1.000000 \n", + "od280/od315_of_diluted_wines 0.519067 -0.428815 0.565468 \n", + "proline 0.330417 0.316100 0.236183 \n", + "target -0.499130 0.265668 -0.617369 \n", + "\n", + " od280/od315_of_diluted_wines proline target \n", + "alcohol 0.072343 0.643720 -0.328222 \n", + "malic_acid -0.368710 -0.192011 0.437776 \n", + "ash 0.003911 0.223626 -0.049643 \n", + "alcalinity_of_ash -0.276769 -0.440597 0.517859 \n", + "magnesium 0.066004 0.393351 -0.209179 \n", + "total_phenols 0.699949 0.498115 -0.719163 \n", + "flavanoids 0.787194 0.494193 -0.847498 \n", + "nonflavanoid_phenols -0.503270 -0.311385 0.489109 \n", + "proanthocyanins 0.519067 0.330417 -0.499130 \n", + "color_intensity -0.428815 0.316100 0.265668 \n", + "hue 0.565468 0.236183 -0.617369 \n", + "od280/od315_of_diluted_wines 1.000000 0.312761 -0.788230 \n", + "proline 0.312761 1.000000 -0.633717 \n", + "target -0.788230 -0.633717 1.000000 " + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_corr = X.corr()\n", + "X_corr" + ] + }, + { + "cell_type": "markdown", + "id": "0e9090b9", + "metadata": {}, + "source": [ + "Создайте список high_corr из признаков, корреляция которых с полем target по абсолютному значению превышает 0.5 (причем, само поле target не должно входить в этот список)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "68e0627f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['alcalinity_of_ash', 'total_phenols', 'flavanoids', 'hue',\n", + " 'od280/od315_of_diluted_wines', 'proline'],\n", + " dtype='object')" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "high_corr = X_corr.loc[(X_corr.index != 'target') & (abs(X_corr['target']) > .5), X_corr.columns != 'target'].index\n", + "high_corr" + ] + }, + { + "cell_type": "markdown", + "id": "94e753ed", + "metadata": {}, + "source": [ + "Удалите из датафрейма X поле с целевой переменной. Для всех признаков, названия которых содержатся в списке high_corr, вычислите квадрат их значений и добавьте в датафрейм X соответствующие поля с суффиксом '_2', добавленным к первоначальному названию признака. Итоговый датафрейм должен содержать все поля, которые были в нем изначально, а также поля с признаками из списка high_corr, возведенными в квадрат. Выведите описание полей датафрейма X с помощью метода describe." + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "5fa1f170", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesproline
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.0
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.0
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.0
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.0
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.0
\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + "\n", + " od280/od315_of_diluted_wines proline \n", + "0 3.92 1065.0 \n", + "1 3.40 1050.0 \n", + "2 3.17 1185.0 \n", + "3 3.45 1480.0 \n", + "4 2.93 735.0 " + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = X.drop('target', axis=1)\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "0c3302c5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolinealcalinity_of_ash_2total_phenols_2flavanoids_2hue_2od280/od315_of_diluted_wines_2proline_2
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.0243.367.84009.36361.081615.36641134225.0
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.0125.447.02257.61761.102511.56001102500.0
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.0345.967.840010.49761.060910.04891404225.0
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.0282.2414.822512.18010.739611.90252190400.0
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.0441.007.84007.23611.08168.5849540225.0
\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + "\n", + " od280/od315_of_diluted_wines proline alcalinity_of_ash_2 \\\n", + "0 3.92 1065.0 243.36 \n", + "1 3.40 1050.0 125.44 \n", + "2 3.17 1185.0 345.96 \n", + "3 3.45 1480.0 282.24 \n", + "4 2.93 735.0 441.00 \n", + "\n", + " total_phenols_2 flavanoids_2 hue_2 od280/od315_of_diluted_wines_2 \\\n", + "0 7.8400 9.3636 1.0816 15.3664 \n", + "1 7.0225 7.6176 1.1025 11.5600 \n", + "2 7.8400 10.4976 1.0609 10.0489 \n", + "3 14.8225 12.1801 0.7396 11.9025 \n", + "4 7.8400 7.2361 1.0816 8.5849 \n", + "\n", + " proline_2 \n", + "0 1134225.0 \n", + "1 1102500.0 \n", + "2 1404225.0 \n", + "3 2190400.0 \n", + "4 540225.0 " + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "for feature_name in high_corr:\n", + " X['{0}_2'.format(feature_name)] = X[feature_name] ** 2\n", + "\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "acc520cb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolinealcalinity_of_ash_2total_phenols_2flavanoids_2hue_2od280/od315_of_diluted_wines_2proline_2
count178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.0000001.780000e+02
mean13.0006182.3363482.36651719.49494499.7415732.2951122.0292700.3618541.5908995.0580900.9574492.611685746.893258391.1428655.6570305.1100490.9686617.3221556.564591e+05
std0.8118271.1171460.2743443.33956414.2824840.6258510.9988590.1244530.5723592.3182860.2285720.709990314.907474133.6717752.9362944.2114410.4437983.5843165.558591e+05
min11.0300000.7400001.36000010.60000070.0000000.9800000.3400000.1300000.4100001.2800000.4800001.270000278.000000112.3600000.9604000.1156000.2304001.6129007.728400e+04
25%12.3625001.6025002.21000017.20000088.0000001.7425001.2050000.2700001.2500003.2200000.7825001.937500500.500000295.8400003.0363251.4521000.6123253.7540752.505010e+05
50%13.0500001.8650002.36000019.50000098.0000002.3550002.1350000.3400001.5550004.6900000.9650002.780000673.500000380.2500005.5460504.5582500.9312507.7284004.536045e+05
75%13.6775003.0825002.55750021.500000107.0000002.8000002.8750000.4375001.9500006.2000001.1200003.170000985.000000462.2500007.8400008.2657001.25440010.0489009.702250e+05
max14.8300005.8000003.23000030.000000162.0000003.8800005.0800000.6600003.58000013.0000001.7100004.0000001680.000000900.00000015.05440025.8064002.92410016.0000002.822400e+06
\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "count 178.000000 178.000000 178.000000 178.000000 178.000000 \n", + "mean 13.000618 2.336348 2.366517 19.494944 99.741573 \n", + "std 0.811827 1.117146 0.274344 3.339564 14.282484 \n", + "min 11.030000 0.740000 1.360000 10.600000 70.000000 \n", + "25% 12.362500 1.602500 2.210000 17.200000 88.000000 \n", + "50% 13.050000 1.865000 2.360000 19.500000 98.000000 \n", + "75% 13.677500 3.082500 2.557500 21.500000 107.000000 \n", + "max 14.830000 5.800000 3.230000 30.000000 162.000000 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "count 178.000000 178.000000 178.000000 178.000000 \n", + "mean 2.295112 2.029270 0.361854 1.590899 \n", + "std 0.625851 0.998859 0.124453 0.572359 \n", + "min 0.980000 0.340000 0.130000 0.410000 \n", + "25% 1.742500 1.205000 0.270000 1.250000 \n", + "50% 2.355000 2.135000 0.340000 1.555000 \n", + "75% 2.800000 2.875000 0.437500 1.950000 \n", + "max 3.880000 5.080000 0.660000 3.580000 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \\\n", + "count 178.000000 178.000000 178.000000 178.000000 \n", + "mean 5.058090 0.957449 2.611685 746.893258 \n", + "std 2.318286 0.228572 0.709990 314.907474 \n", + "min 1.280000 0.480000 1.270000 278.000000 \n", + "25% 3.220000 0.782500 1.937500 500.500000 \n", + "50% 4.690000 0.965000 2.780000 673.500000 \n", + "75% 6.200000 1.120000 3.170000 985.000000 \n", + "max 13.000000 1.710000 4.000000 1680.000000 \n", + "\n", + " alcalinity_of_ash_2 total_phenols_2 flavanoids_2 hue_2 \\\n", + "count 178.000000 178.000000 178.000000 178.000000 \n", + "mean 391.142865 5.657030 5.110049 0.968661 \n", + "std 133.671775 2.936294 4.211441 0.443798 \n", + "min 112.360000 0.960400 0.115600 0.230400 \n", + "25% 295.840000 3.036325 1.452100 0.612325 \n", + "50% 380.250000 5.546050 4.558250 0.931250 \n", + "75% 462.250000 7.840000 8.265700 1.254400 \n", + "max 900.000000 
15.054400 25.806400 2.924100 \n", + "\n", + " od280/od315_of_diluted_wines_2 proline_2 \n", + "count 178.000000 1.780000e+02 \n", + "mean 7.322155 6.564591e+05 \n", + "std 3.584316 5.558591e+05 \n", + "min 1.612900 7.728400e+04 \n", + "25% 3.754075 2.505010e+05 \n", + "50% 7.728400 4.536045e+05 \n", + "75% 10.048900 9.702250e+05 \n", + "max 16.000000 2.822400e+06 " + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c970707", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}