{ "cells": [ { "cell_type": "markdown", "id": "e4f5991e", "metadata": {}, "source": [ "# Тема “Обучение с учителем”" ] }, { "cell_type": "markdown", "id": "2442aab9", "metadata": {}, "source": [ "## Задание 1\n", "Импортируйте библиотеки pandas и numpy.\n", "\n", "Загрузите \"Boston House Prices dataset\" из встроенных наборов данных библиотеки sklearn. Создайте датафреймы X и y из этих данных.\n", "\n", "Разбейте эти датафреймы на тренировочные (X_train, y_train) и тестовые (X_test, y_test) с помощью функции train_test_split так, чтобы размер тестовой выборки составлял 30% от всех данных, при этом аргумент random_state должен быть равен 42.\n", "\n", "Создайте модель линейной регрессии под названием lr с помощью класса LinearRegression из модуля sklearn.linear_model.\n", "\n", "Обучите модель на тренировочных данных (используйте все признаки) и сделайте предсказание на тестовых.\n", "\n", "Вычислите R2 полученных предсказаний с помощью r2_score из модуля sklearn.metrics." ] }, { "cell_type": "code", "execution_count": 1, "id": "f79ac751", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "id": "483c687f", "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import load_boston" ] }, { "cell_type": "code", "execution_count": 3, "id": "fd8d693f", "metadata": { "scrolled": true }, "outputs": [], "source": [ "import warnings\n", "from sklearn.datasets import load_boston\n", "with warnings.catch_warnings():\n", " # You should probably not use this dataset.\n", " warnings.filterwarnings(\"ignore\")\n", " boston = load_boston()\n", "data = boston[\"data\"]" ] }, { "cell_type": "code", "execution_count": 4, "id": "81339e3e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTAT
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.98
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.14
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.03
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.94
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.33
\n", "
" ], "text/plain": [ " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n", "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n", "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n", "2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n", "3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n", "4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n", "\n", " PTRATIO B LSTAT \n", "0 15.3 396.90 4.98 \n", "1 17.8 396.90 9.14 \n", "2 17.8 392.83 4.03 \n", "3 18.7 394.63 2.94 \n", "4 18.7 396.90 5.33 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_names = boston[\"feature_names\"]\n", "\n", "X = pd.DataFrame(data, columns=feature_names)\n", "X.head()" ] }, { "cell_type": "code", "execution_count": 5, "id": "0a3b3fbd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
price
024.0
121.6
234.7
333.4
436.2
\n", "
" ], "text/plain": [ " price\n", "0 24.0\n", "1 21.6\n", "2 34.7\n", "3 33.4\n", "4 36.2" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target = boston[\"target\"]\n", "\n", "Y = pd.DataFrame(target, columns=[\"price\"])\n", "Y.head()" ] }, { "cell_type": "code", "execution_count": 6, "id": "81f5f72a", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 7, "id": "eca2e802", "metadata": {}, "outputs": [], "source": [ "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=42)" ] }, { "cell_type": "code", "execution_count": 8, "id": "b7fdd109", "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LinearRegression" ] }, { "cell_type": "code", "execution_count": 9, "id": "87bbc227", "metadata": {}, "outputs": [], "source": [ "lr = LinearRegression()" ] }, { "cell_type": "code", "execution_count": 10, "id": "41af6442", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LinearRegression()" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lr.fit(X_train, Y_train)" ] }, { "cell_type": "code", "execution_count": 11, "id": "28a67c09", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Y_testY_pred_lr
17323.628.648960
27432.436.495014
49113.615.411193
7222.825.403213
45216.118.855280
\n", "
" ], "text/plain": [ " Y_test Y_pred_lr\n", "173 23.6 28.648960\n", "274 32.4 36.495014\n", "491 13.6 15.411193\n", "72 22.8 25.403213\n", "452 16.1 18.855280" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_pred_lr = lr.predict(X_test)\n", "check_test_lr = pd.DataFrame({\n", " \"Y_test\": Y_test[\"price\"], \n", " \"Y_pred_lr\": y_pred_lr.flatten()})\n", "\n", "check_test_lr.head()" ] }, { "cell_type": "code", "execution_count": 12, "id": "4a035a94", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "21.517444231176995\n" ] } ], "source": [ "from sklearn.metrics import mean_squared_error\n", "\n", "mean_squared_error_lr = mean_squared_error(check_test_lr[\"Y_pred_lr\"], check_test_lr[\"Y_test\"])\n", "print(mean_squared_error_lr)" ] }, { "cell_type": "code", "execution_count": 13, "id": "2b87195e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.711226005748496" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics import r2_score\n", "\n", "r2_score(Y_test, y_pred_lr)" ] }, { "cell_type": "markdown", "id": "bbf6dc90", "metadata": {}, "source": [ "## Задание 2\n", "\n", "Создайте модель под названием model с помощью RandomForestRegressor из модуля sklearn.ensemble.\n", "\n", "Сделайте аргумент n_estimators равным 1000, max_depth должен быть равен 12 и random_state сделайте равным 42.\n", "\n", "Обучите модель на тренировочных данных аналогично тому, как вы обучали модель LinearRegression,\n", "но при этом в метод fit вместо датафрейма y_train поставьте y_train.values[:, 0],\n", "чтобы получить из датафрейма одномерный массив Numpy,\n", "так как для класса RandomForestRegressor в данном методе для аргумента y предпочтительно применение массивов вместо датафрейма.\n", "\n", "Сделайте предсказание на тестовых данных и посчитайте R2.\n", "\n", "Сравните с результатом из предыдущего задания. 
Напишите в комментариях к коду, какая модель в данном случае работает лучше." ] }, { "cell_type": "code", "execution_count": 14, "id": "f8f381fd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestRegressor(max_depth=12, n_estimators=1000, random_state=42)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.ensemble import RandomForestRegressor\n", "model = RandomForestRegressor(n_estimators=1000, max_depth=12, random_state=42)\n", "model.fit(X_train, Y_train.values[:, 0])" ] }, { "cell_type": "code", "execution_count": 15, "id": "c2733e21", "metadata": {}, "outputs": [], "source": [ "y_pred_1 = model.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 16, "id": "ff72edb4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.87472606157312" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "r2_score(Y_test, y_pred_1)" ] }, { "cell_type": "code", "execution_count": 17, "id": "24d68924", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
y_testy_pred_lry_pred_rf
17323.628.64896022.806412
27432.436.49501431.131464
49113.615.41119316.339125
7222.825.40321323.810726
45216.118.85528017.139521
7620.023.14668921.832284
31617.817.39212419.895747
14014.014.07859914.754118
47119.623.03692721.240835
50016.820.59943320.898658
\n", "
" ], "text/plain": [ " y_test y_pred_lr y_pred_rf\n", "173 23.6 28.648960 22.806412\n", "274 32.4 36.495014 31.131464\n", "491 13.6 15.411193 16.339125\n", "72 22.8 25.403213 23.810726\n", "452 16.1 18.855280 17.139521\n", "76 20.0 23.146689 21.832284\n", "316 17.8 17.392124 19.895747\n", "140 14.0 14.078599 14.754118\n", "471 19.6 23.036927 21.240835\n", "500 16.8 20.599433 20.898658" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "check_test = pd.DataFrame({\n", " \"y_test\": Y_test[\"price\"],\n", " \"y_pred_lr\": y_pred_lr.flatten(),\n", " \"y_pred_rf\": y_pred_1.flatten(),\n", "})\n", "\n", "check_test.head(10)" ] }, { "cell_type": "markdown", "id": "9f09783f", "metadata": {}, "source": [ "R2 из первого задания (≈ 0.711) меньше, чем R2 во втором задании (≈ 0.875), а значит, у модели, построенной с помощью RandomForestRegressor, предсказания ближе к тестовым значениям." ] }, { "cell_type": "markdown", "id": "a86c9368", "metadata": {}, "source": [ "## *Задание 3\n", "Вызовите документацию для класса RandomForestRegressor, найдите информацию об атрибуте feature_importances_.\n", "\n", "С помощью этого атрибута найдите сумму всех показателей важности, установите, какие два признака показывают наибольшую важность." ] }, { "cell_type": "code", "execution_count": 18, "id": "8acc1978", "metadata": {}, "outputs": [], "source": [ "?RandomForestRegressor" ] }, { "cell_type": "code", "execution_count": 19, "id": "cd674bb4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0.03167574 0.00154252 0.00713813 0.00123624 0.01426897 0.40268179\n", " 0.01429864 0.06397257 0.00528122 0.01152493 0.01808108 0.01245085\n", " 0.41584732]\n" ] } ], "source": [ "print(model.feature_importances_)" ] }, { "cell_type": "code", "execution_count": 20, "id": "1e1dbef5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
feature_importancename
00.031676CRIM
10.001543ZN
20.007138INDUS
30.001236CHAS
40.014269NOX
50.402682RM
60.014299AGE
70.063973DIS
80.005281RAD
90.011525TAX
100.018081PTRATIO
110.012451B
120.415847LSTAT
\n", "
" ], "text/plain": [ " feature_importance name\n", "0 0.031676 CRIM\n", "1 0.001543 ZN\n", "2 0.007138 INDUS\n", "3 0.001236 CHAS\n", "4 0.014269 NOX\n", "5 0.402682 RM\n", "6 0.014299 AGE\n", "7 0.063973 DIS\n", "8 0.005281 RAD\n", "9 0.011525 TAX\n", "10 0.018081 PTRATIO\n", "11 0.012451 B\n", "12 0.415847 LSTAT" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_importance = pd.DataFrame({'name':X.columns, \n", " 'feature_importance':model.feature_importances_}, \n", " columns=['feature_importance', 'name'])\n", "feature_importance" ] }, { "cell_type": "markdown", "id": "3bf5bd9e", "metadata": {}, "source": [ "Два признака, показывающие наибольшую важность:" ] }, { "cell_type": "code", "execution_count": 21, "id": "82439470", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
feature_importancename
120.415847LSTAT
50.402682RM
\n", "
" ], "text/plain": [ " feature_importance name\n", "12 0.415847 LSTAT\n", "5 0.402682 RM" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_importance.nlargest(2, 'feature_importance')" ] }, { "cell_type": "markdown", "id": "9c507bb6", "metadata": {}, "source": [ "Сумма показателей важности:" ] }, { "cell_type": "code", "execution_count": 22, "id": "2aae8d49", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.0" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.feature_importances_.sum()" ] }, { "cell_type": "markdown", "id": "3e9baeab", "metadata": {}, "source": [ "## *Задание 4\n", "\n", "В этом задании мы будем работать с датасетом, с которым мы уже знакомы по домашнему заданию по библиотеке Matplotlib, это датасет Credit Card Fraud Detection. Для этого датасета мы будем решать задачу классификации - будем определять, какие из транзакций по кредитной карте являются мошенническими. Данный датасет сильно несбалансирован (так как случаи мошенничества относительно редки), так что применение метрики accuracy не принесет пользы и не поможет выбрать лучшую модель. Мы будем вычислять AUC, то есть площадь под кривой ROC.\n", "\n", "Импортируйте из соответствующих модулей RandomForestClassifier, GridSearchCV и train_test_split.\n", "\n", "Загрузите датасет creditcard.csv и создайте датафрейм df.\n", "\n", "С помощью метода value_counts с аргументом normalize=True убедитесь в том, что выборка несбалансирована. Используя метод info, проверьте, все ли столбцы содержат числовые данные и нет ли в них пропусков. 
Примените следующую настройку, чтобы можно было просматривать все столбцы датафрейма: pd.options.display.max_columns = 100.\n", "\n", "Просмотрите первые 10 строк датафрейма df.\n", "\n", "Создайте датафрейм X из датафрейма df, исключив столбец Class.\n", "\n", "Создайте объект Series под названием y из столбца Class.\n", "\n", "Разбейте X и y на тренировочный и тестовый наборы данных при помощи функции train_test_split, используя аргументы: test_size=0.3, random_state=100, stratify=y. У вас должны получиться объекты X_train, X_test, y_train и y_test.\n", "\n", "Просмотрите информацию об их форме. Для поиска по сетке параметров задайте такие параметры: parameters = [{'n_estimators': [10, 15],'max_features': np.arange(3, 5),'max_depth': np.arange(4, 7)}]\n", "\n", "Создайте модель GridSearchCV со следующими аргументами: estimator=RandomForestClassifier(random_state=100), param_grid=parameters, scoring='roc_auc', cv=3.\n", "\n", "Обучите модель на тренировочном наборе данных (может занять несколько минут).\n", "\n", "Просмотрите параметры лучшей модели с помощью атрибута best_params_.\n", "\n", "Предскажите вероятности классов с помощью полученной модели и метода predict_proba.\n", "\n", "Из полученного результата (массив Numpy) выберите столбец с индексом 1 (вероятность класса 1) и запишите в массив y_pred_proba.\n", "\n", "Из модуля sklearn.metrics импортируйте метрику roc_auc_score.\n", "\n", "Вычислите AUC на тестовых данных и сравните с результатом, полученным на тренировочных данных, используя в качестве аргументов массивы y_test и y_pred_proba." ] }, { "cell_type": "code", "execution_count": 23, "id": "c890d00a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TimeV1V2V3V4V5V6V7V8V9...V21V22V23V24V25V26V27V28AmountClass
00.0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.363787...-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.021053149.620
10.01.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425...-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.0147242.690
21.0-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.514654...0.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.059752378.660
31.0-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024...-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.061458123.500
42.0-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.817739...-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.21515369.990
52.0-0.4259660.9605231.141109-0.1682520.420987-0.0297280.4762010.260314-0.568671...-0.208254-0.559825-0.026398-0.371427-0.2327940.1059150.2538440.0810803.670
64.01.2296580.1410040.0453711.2026130.1918810.272708-0.0051590.0812130.464960...-0.167716-0.270710-0.154104-0.7800550.750137-0.2572370.0345070.0051684.990
77.0-0.6442691.4179641.074380-0.4921990.9489340.4281181.120631-3.8078640.615375...1.943465-1.0154550.057504-0.649709-0.415267-0.051634-1.206921-1.08533940.800
87.0-0.8942860.286157-0.113192-0.2715262.6695993.7218180.3701450.851084-0.392048...-0.073425-0.268092-0.2042331.0115920.373205-0.3841570.0117470.14240493.200
99.0-0.3382621.1195931.044367-0.2221870.499361-0.2467610.6515830.069539-0.736727...-0.246914-0.633753-0.120794-0.385050-0.0697330.0941990.2462190.0830763.680
\n", "

10 rows × 31 columns

\n", "
" ], "text/plain": [ " Time V1 V2 V3 V4 V5 V6 V7 \\\n", "0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n", "1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n", "2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n", "3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n", "4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n", "5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 \n", "6 4.0 1.229658 0.141004 0.045371 1.202613 0.191881 0.272708 -0.005159 \n", "7 7.0 -0.644269 1.417964 1.074380 -0.492199 0.948934 0.428118 1.120631 \n", "8 7.0 -0.894286 0.286157 -0.113192 -0.271526 2.669599 3.721818 0.370145 \n", "9 9.0 -0.338262 1.119593 1.044367 -0.222187 0.499361 -0.246761 0.651583 \n", "\n", " V8 V9 ... V21 V22 V23 V24 V25 \\\n", "0 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 \n", "1 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 \n", "2 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 \n", "3 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 \n", "4 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 \n", "5 0.260314 -0.568671 ... -0.208254 -0.559825 -0.026398 -0.371427 -0.232794 \n", "6 0.081213 0.464960 ... -0.167716 -0.270710 -0.154104 -0.780055 0.750137 \n", "7 -3.807864 0.615375 ... 1.943465 -1.015455 0.057504 -0.649709 -0.415267 \n", "8 0.851084 -0.392048 ... -0.073425 -0.268092 -0.204233 1.011592 0.373205 \n", "9 0.069539 -0.736727 ... 
-0.246914 -0.633753 -0.120794 -0.385050 -0.069733 \n", "\n", " V26 V27 V28 Amount Class \n", "0 -0.189115 0.133558 -0.021053 149.62 0 \n", "1 0.125895 -0.008983 0.014724 2.69 0 \n", "2 -0.139097 -0.055353 -0.059752 378.66 0 \n", "3 -0.221929 0.062723 0.061458 123.50 0 \n", "4 0.502292 0.219422 0.215153 69.99 0 \n", "5 0.105915 0.253844 0.081080 3.67 0 \n", "6 -0.257237 0.034507 0.005168 4.99 0 \n", "7 -0.051634 -1.206921 -1.085339 40.80 0 \n", "8 -0.384157 0.011747 0.142404 93.20 0 \n", "9 0.094199 0.246219 0.083076 3.68 0 \n", "\n", "[10 rows x 31 columns]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.model_selection import train_test_split\n", "\n", "df = pd.read_csv('creditcard.csv')\n", "df.head(10)" ] }, { "cell_type": "code", "execution_count": 24, "id": "0201a188", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 0.998273\n", "1 0.001727\n", "Name: Class, dtype: float64" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['Class'].value_counts(normalize=True)" ] }, { "cell_type": "code", "execution_count": 25, "id": "9ffa6a96", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 284807 entries, 0 to 284806\n", "Data columns (total 31 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Time 284807 non-null float64\n", " 1 V1 284807 non-null float64\n", " 2 V2 284807 non-null float64\n", " 3 V3 284807 non-null float64\n", " 4 V4 284807 non-null float64\n", " 5 V5 284807 non-null float64\n", " 6 V6 284807 non-null float64\n", " 7 V7 284807 non-null float64\n", " 8 V8 284807 non-null float64\n", " 9 V9 284807 non-null float64\n", " 10 V10 284807 non-null float64\n", " 11 V11 284807 non-null float64\n", " 12 V12 284807 non-null float64\n", " 13 V13 
284807 non-null float64\n", " 14 V14 284807 non-null float64\n", " 15 V15 284807 non-null float64\n", " 16 V16 284807 non-null float64\n", " 17 V17 284807 non-null float64\n", " 18 V18 284807 non-null float64\n", " 19 V19 284807 non-null float64\n", " 20 V20 284807 non-null float64\n", " 21 V21 284807 non-null float64\n", " 22 V22 284807 non-null float64\n", " 23 V23 284807 non-null float64\n", " 24 V24 284807 non-null float64\n", " 25 V25 284807 non-null float64\n", " 26 V26 284807 non-null float64\n", " 27 V27 284807 non-null float64\n", " 28 V28 284807 non-null float64\n", " 29 Amount 284807 non-null float64\n", " 30 Class 284807 non-null int64 \n", "dtypes: float64(30), int64(1)\n", "memory usage: 67.4 MB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 26, "id": "8d3439e3", "metadata": {}, "outputs": [], "source": [ "pd.options.display.max_columns=100" ] }, { "cell_type": "code", "execution_count": 27, "id": "0f57a690", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TimeV1V2V3V4V5V6V7V8V9V10V11V12V13V14V15V16V17V18V19V20V21V22V23V24V25V26V27V28AmountClass
00.0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.3637870.090794-0.551600-0.617801-0.991390-0.3111691.468177-0.4704010.2079710.0257910.4039930.251412-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.021053149.620
10.01.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425-0.1669741.6127271.0652350.489095-0.1437720.6355580.463917-0.114805-0.183361-0.145783-0.069083-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.0147242.690
21.0-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.5146540.2076430.6245010.0660840.717293-0.1659462.345865-2.8900831.109969-0.121359-2.2618570.5249800.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.059752378.660
31.0-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024-0.054952-0.2264870.1782280.507757-0.287924-0.631418-1.059647-0.6840931.965775-1.232622-0.208038-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.061458123.500
42.0-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.8177390.753074-0.8228430.5381961.345852-1.1196700.175121-0.451449-0.237033-0.0381950.8034870.408542-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.21515369.990
52.0-0.4259660.9605231.141109-0.1682520.420987-0.0297280.4762010.260314-0.568671-0.3714071.3412620.359894-0.358091-0.1371340.5176170.401726-0.0581330.068653-0.0331940.084968-0.208254-0.559825-0.026398-0.371427-0.2327940.1059150.2538440.0810803.670
64.01.2296580.1410040.0453711.2026130.1918810.272708-0.0051590.0812130.464960-0.099254-1.416907-0.153826-0.7510630.1673720.050144-0.4435870.002821-0.611987-0.045575-0.219633-0.167716-0.270710-0.154104-0.7800550.750137-0.2572370.0345070.0051684.990
77.0-0.6442691.4179641.074380-0.4921990.9489340.4281181.120631-3.8078640.6153751.249376-0.6194680.2914741.757964-1.3238650.686133-0.076127-1.222127-0.3582220.324505-0.1567421.943465-1.0154550.057504-0.649709-0.415267-0.051634-1.206921-1.08533940.800
87.0-0.8942860.286157-0.113192-0.2715262.6695993.7218180.3701450.851084-0.392048-0.410430-0.705117-0.110452-0.2862540.074355-0.328783-0.210077-0.4997680.1187650.5703280.052736-0.073425-0.268092-0.2042331.0115920.373205-0.3841570.0117470.14240493.200
99.0-0.3382621.1195931.044367-0.2221870.499361-0.2467610.6515830.069539-0.736727-0.3668461.0176140.8363901.006844-0.4435230.1502190.739453-0.5409800.4766770.4517730.203711-0.246914-0.633753-0.120794-0.385050-0.0697330.0941990.2462190.0830763.680
\n", "
" ], "text/plain": [ " Time V1 V2 V3 V4 V5 V6 V7 \\\n", "0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n", "1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n", "2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n", "3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n", "4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n", "5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 \n", "6 4.0 1.229658 0.141004 0.045371 1.202613 0.191881 0.272708 -0.005159 \n", "7 7.0 -0.644269 1.417964 1.074380 -0.492199 0.948934 0.428118 1.120631 \n", "8 7.0 -0.894286 0.286157 -0.113192 -0.271526 2.669599 3.721818 0.370145 \n", "9 9.0 -0.338262 1.119593 1.044367 -0.222187 0.499361 -0.246761 0.651583 \n", "\n", " V8 V9 V10 V11 V12 V13 V14 \\\n", "0 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 \n", "1 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 \n", "2 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 \n", "3 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 \n", "4 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 \n", "5 0.260314 -0.568671 -0.371407 1.341262 0.359894 -0.358091 -0.137134 \n", "6 0.081213 0.464960 -0.099254 -1.416907 -0.153826 -0.751063 0.167372 \n", "7 -3.807864 0.615375 1.249376 -0.619468 0.291474 1.757964 -1.323865 \n", "8 0.851084 -0.392048 -0.410430 -0.705117 -0.110452 -0.286254 0.074355 \n", "9 0.069539 -0.736727 -0.366846 1.017614 0.836390 1.006844 -0.443523 \n", "\n", " V15 V16 V17 V18 V19 V20 V21 \\\n", "0 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 \n", "1 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 \n", "2 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 \n", "3 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 -0.208038 -0.108300 \n", "4 0.175121 -0.451449 -0.237033 
-0.038195 0.803487 0.408542 -0.009431 \n", "5 0.517617 0.401726 -0.058133 0.068653 -0.033194 0.084968 -0.208254 \n", "6 0.050144 -0.443587 0.002821 -0.611987 -0.045575 -0.219633 -0.167716 \n", "7 0.686133 -0.076127 -1.222127 -0.358222 0.324505 -0.156742 1.943465 \n", "8 -0.328783 -0.210077 -0.499768 0.118765 0.570328 0.052736 -0.073425 \n", "9 0.150219 0.739453 -0.540980 0.476677 0.451773 0.203711 -0.246914 \n", "\n", " V22 V23 V24 V25 V26 V27 V28 \\\n", "0 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 \n", "1 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 \n", "2 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 \n", "3 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 \n", "4 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 \n", "5 -0.559825 -0.026398 -0.371427 -0.232794 0.105915 0.253844 0.081080 \n", "6 -0.270710 -0.154104 -0.780055 0.750137 -0.257237 0.034507 0.005168 \n", "7 -1.015455 0.057504 -0.649709 -0.415267 -0.051634 -1.206921 -1.085339 \n", "8 -0.268092 -0.204233 1.011592 0.373205 -0.384157 0.011747 0.142404 \n", "9 -0.633753 -0.120794 -0.385050 -0.069733 0.094199 0.246219 0.083076 \n", "\n", " Amount Class \n", "0 149.62 0 \n", "1 2.69 0 \n", "2 378.66 0 \n", "3 123.50 0 \n", "4 69.99 0 \n", "5 3.67 0 \n", "6 4.99 0 \n", "7 40.80 0 \n", "8 93.20 0 \n", "9 3.68 0 " ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(10)" ] }, { "cell_type": "code", "execution_count": 28, "id": "3530430c", "metadata": {}, "outputs": [], "source": [ "X = df.drop(\"Class\", axis=1)\n", "y = df[\"Class\"]" ] }, { "cell_type": "code", "execution_count": 29, "id": "c66ea2e2", "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)" ] }, { "cell_type": "code", "execution_count": 30, "id": "1e628dd0", "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "X_train (199364, 30)\n", "X_test (85443, 30)\n", "y_train (199364,)\n", "y_test (85443,)\n" ] } ], "source": [ "print('X_train ', X_train.shape)\n", "print('X_test ', X_test.shape)\n", "print('y_train ', y_train.shape)\n", "print('y_test ', y_test.shape)" ] }, { "cell_type": "code", "execution_count": 31, "id": "8908147f", "metadata": {}, "outputs": [], "source": [ "parameters = [{'n_estimators': [10, 15],'max_features': np.arange(3, 5),'max_depth': np.arange(4, 7)}]" ] }, { "cell_type": "code", "execution_count": 32, "id": "8f593bef", "metadata": {}, "outputs": [], "source": [ "clf = GridSearchCV(\n", " estimator=RandomForestClassifier(random_state=100),\n", " param_grid=parameters,\n", " scoring='roc_auc',\n", " cv=3,\n", ")" ] }, { "cell_type": "code", "execution_count": 33, "id": "99d17337", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=100),\n", " param_grid=[{'max_depth': array([4, 5, 6]),\n", " 'max_features': array([3, 4]),\n", " 'n_estimators': [10, 15]}],\n", " scoring='roc_auc')" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clf.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 34, "id": "79f8c7e5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'max_depth': 6, 'max_features': 3, 'n_estimators': 15}" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clf.best_params_" ] }, { "cell_type": "code", "execution_count": 35, "id": "74efab0a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestClassifier(max_depth=6, max_features=3, n_estimators=15)" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clf = RandomForestClassifier(max_depth=6, max_features=3, n_estimators=15)\n", "\n", "clf.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 36, "id": 
"e4d1fe4a", "metadata": {}, "outputs": [], "source": [ "y_pred = clf.predict_proba(X_test)" ] }, { "cell_type": "code", "execution_count": 37, "id": "49357f79", "metadata": {}, "outputs": [], "source": [ "y_pred_proba = y_pred[:, 1]" ] }, { "cell_type": "code", "execution_count": 38, "id": "fc40ec74", "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import roc_auc_score" ] }, { "cell_type": "code", "execution_count": 39, "id": "84b0112b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9476239854368701" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "roc_auc_score(y_test, y_pred_proba)" ] }, { "cell_type": "markdown", "id": "475ac08f", "metadata": {}, "source": [ "# *Дополнительные задания:" ] }, { "cell_type": "markdown", "id": "3728fe05", "metadata": {}, "source": [ "Загрузите датасет Wine из встроенных датасетов sklearn.datasets с помощью функции load_wine в переменную data." ] }, { "cell_type": "code", "execution_count": 40, "id": "e0600074", "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import load_wine\n", "data = load_wine()" ] }, { "cell_type": "markdown", "id": "6e22dfd0", "metadata": {}, "source": [ "Полученный датасет не является датафреймом. Это структура данных, имеющая ключи аналогично словарю. Просмотрите тип данных этой структуры данных и создайте список data_keys, содержащий ее ключи." ] }, { "cell_type": "code", "execution_count": 41, "id": "1cc31a29", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " \n", "\n", "dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])\n" ] } ], "source": [ "print(type(data), '\\n')\n", "data_keys = data.keys()\n", "print(data_keys)" ] }, { "cell_type": "markdown", "id": "05b0491c", "metadata": {}, "source": [ "Просмотрите данные, описание и названия признаков в датасете. 
Описание нужно вывести в виде привычного, аккуратно оформленного текста, без обозначений переноса строки, но с самими переносами и т.д" ] }, { "cell_type": "code", "execution_count": 42, "id": "67997daa", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,\n", " 1.065e+03],\n", " [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,\n", " 1.050e+03],\n", " [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,\n", " 1.185e+03],\n", " ...,\n", " [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,\n", " 8.350e+02],\n", " [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,\n", " 8.400e+02],\n", " [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,\n", " 5.600e+02]])" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.data" ] }, { "cell_type": "code", "execution_count": 43, "id": "42bf0f1f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ".. 
_wine_dataset:\n", "\n", "Wine recognition dataset\n", "------------------------\n", "\n", "**Data Set Characteristics:**\n", "\n", " :Number of Instances: 178 (50 in each of three classes)\n", " :Number of Attributes: 13 numeric, predictive attributes and the class\n", " :Attribute Information:\n", " \t\t- Alcohol\n", " \t\t- Malic acid\n", " \t\t- Ash\n", "\t\t- Alcalinity of ash \n", " \t\t- Magnesium\n", "\t\t- Total phenols\n", " \t\t- Flavanoids\n", " \t\t- Nonflavanoid phenols\n", " \t\t- Proanthocyanins\n", "\t\t- Color intensity\n", " \t\t- Hue\n", " \t\t- OD280/OD315 of diluted wines\n", " \t\t- Proline\n", "\n", " - class:\n", " - class_0\n", " - class_1\n", " - class_2\n", "\t\t\n", " :Summary Statistics:\n", " \n", " ============================= ==== ===== ======= =====\n", " Min Max Mean SD\n", " ============================= ==== ===== ======= =====\n", " Alcohol: 11.0 14.8 13.0 0.8\n", " Malic Acid: 0.74 5.80 2.34 1.12\n", " Ash: 1.36 3.23 2.36 0.27\n", " Alcalinity of Ash: 10.6 30.0 19.5 3.3\n", " Magnesium: 70.0 162.0 99.7 14.3\n", " Total Phenols: 0.98 3.88 2.29 0.63\n", " Flavanoids: 0.34 5.08 2.03 1.00\n", " Nonflavanoid Phenols: 0.13 0.66 0.36 0.12\n", " Proanthocyanins: 0.41 3.58 1.59 0.57\n", " Colour Intensity: 1.3 13.0 5.1 2.3\n", " Hue: 0.48 1.71 0.96 0.23\n", " OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.71\n", " Proline: 278 1680 746 315\n", " ============================= ==== ===== ======= =====\n", "\n", " :Missing Attribute Values: None\n", " :Class Distribution: class_0 (59), class_1 (71), class_2 (48)\n", " :Creator: R.A. Fisher\n", " :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n", " :Date: July, 1988\n", "\n", "This is a copy of UCI ML Wine recognition datasets.\n", "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data\n", "\n", "The data is the results of a chemical analysis of wines grown in the same\n", "region in Italy by three different cultivators. 
There are thirteen different\n", "measurements taken for different constituents found in the three types of\n", "wine.\n", "\n", "Original Owners: \n", "\n", "Forina, M. et al, PARVUS - \n", "An Extendible Package for Data Exploration, Classification and Correlation. \n", "Institute of Pharmaceutical and Food Analysis and Technologies,\n", "Via Brigata Salerno, 16147 Genoa, Italy.\n", "\n", "Citation:\n", "\n", "Lichman, M. (2013). UCI Machine Learning Repository\n", "[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,\n", "School of Information and Computer Science. \n", "\n", ".. topic:: References\n", "\n", " (1) S. Aeberhard, D. Coomans and O. de Vel, \n", " Comparison of Classifiers in High Dimensional Settings, \n", " Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of \n", " Mathematics and Statistics, James Cook University of North Queensland. \n", " (Also submitted to Technometrics). \n", "\n", " The data was used with many others for comparing various \n", " classifiers. The classes are separable, though only RDA \n", " has achieved 100% correct classification. \n", " (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) \n", " (All results using the leave-one-out technique) \n", "\n", " (2) S. Aeberhard, D. Coomans and O. de Vel, \n", " \"THE CLASSIFICATION PERFORMANCE OF RDA\" \n", " Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of \n", " Mathematics and Statistics, James Cook University of North Queensland. 
\n", " (Also submitted to Journal of Chemometrics).\n", "\n" ] } ], "source": [ "print(data.DESCR)" ] }, { "cell_type": "code", "execution_count": 44, "id": "3990394e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['alcohol',\n", " 'malic_acid',\n", " 'ash',\n", " 'alcalinity_of_ash',\n", " 'magnesium',\n", " 'total_phenols',\n", " 'flavanoids',\n", " 'nonflavanoid_phenols',\n", " 'proanthocyanins',\n", " 'color_intensity',\n", " 'hue',\n", " 'od280/od315_of_diluted_wines',\n", " 'proline']" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.feature_names" ] }, { "cell_type": "markdown", "id": "ca3e3b90", "metadata": {}, "source": [ "Сколько классов содержит целевая переменная датасета? Выведите названия классов." ] }, { "cell_type": "code", "execution_count": 45, "id": "3dcc2473", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Количество классов: (3,) \n", "\n", "Названия классов: ['class_0' 'class_1' 'class_2']\n" ] } ], "source": [ "print('Количество классов: ', np.unique(data[\"target\"]).shape, '\\n')\n", "print('Названия классов: ',data[\"target_names\"])" ] }, { "cell_type": "markdown", "id": "4eb0d981", "metadata": {}, "source": [ "На основе данных датасета (они содержатся в двумерном массиве Numpy) и названий признаков создайте датафрейм под названием X." ] }, { "cell_type": "code", "execution_count": 46, "id": "52257354", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesproline
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.0
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.0
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.0
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.0
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.0
\n", "
" ], "text/plain": [ " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", "\n", " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", "0 3.06 0.28 2.29 5.64 1.04 \n", "1 2.76 0.26 1.28 4.38 1.05 \n", "2 3.24 0.30 2.81 5.68 1.03 \n", "3 3.49 0.24 2.18 7.80 0.86 \n", "4 2.69 0.39 1.82 4.32 1.04 \n", "\n", " od280/od315_of_diluted_wines proline \n", "0 3.92 1065.0 \n", "1 3.40 1050.0 \n", "2 3.17 1185.0 \n", "3 3.45 1480.0 \n", "4 2.93 735.0 " ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = pd.DataFrame(data.data, columns=data.feature_names)\n", "X.head()" ] }, { "cell_type": "markdown", "id": "917c33ed", "metadata": {}, "source": [ "Выясните размер датафрейма X и установите, имеются ли в нем пропущенные значения." 
] }, { "cell_type": "code", "execution_count": 47, "id": "f66d1569", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(178, 13)" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.shape" ] }, { "cell_type": "code", "execution_count": 48, "id": "4a1379f8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 178 entries, 0 to 177\n", "Data columns (total 13 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 alcohol 178 non-null float64\n", " 1 malic_acid 178 non-null float64\n", " 2 ash 178 non-null float64\n", " 3 alcalinity_of_ash 178 non-null float64\n", " 4 magnesium 178 non-null float64\n", " 5 total_phenols 178 non-null float64\n", " 6 flavanoids 178 non-null float64\n", " 7 nonflavanoid_phenols 178 non-null float64\n", " 8 proanthocyanins 178 non-null float64\n", " 9 color_intensity 178 non-null float64\n", " 10 hue 178 non-null float64\n", " 11 od280/od315_of_diluted_wines 178 non-null float64\n", " 12 proline 178 non-null float64\n", "dtypes: float64(13)\n", "memory usage: 18.2 KB\n" ] } ], "source": [ "X.info()" ] }, { "cell_type": "code", "execution_count": 49, "id": "f5573521", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "alcohol 0\n", "malic_acid 0\n", "ash 0\n", "alcalinity_of_ash 0\n", "magnesium 0\n", "total_phenols 0\n", "flavanoids 0\n", "nonflavanoid_phenols 0\n", "proanthocyanins 0\n", "color_intensity 0\n", "hue 0\n", "od280/od315_of_diluted_wines 0\n", "proline 0\n", "dtype: int64" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.isnull().astype(\"int\").sum()" ] }, { "cell_type": "markdown", "id": "7fcfb081", "metadata": {}, "source": [ "Добавьте в датафрейм поле с классами вин в виде чисел, имеющих тип данных numpy.int64. Название поля - 'target'." 
] }, { "cell_type": "code", "execution_count": 50, "id": "89d0aa13", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 178 entries, 0 to 177\n", "Data columns (total 14 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 alcohol 178 non-null float64\n", " 1 malic_acid 178 non-null float64\n", " 2 ash 178 non-null float64\n", " 3 alcalinity_of_ash 178 non-null float64\n", " 4 magnesium 178 non-null float64\n", " 5 total_phenols 178 non-null float64\n", " 6 flavanoids 178 non-null float64\n", " 7 nonflavanoid_phenols 178 non-null float64\n", " 8 proanthocyanins 178 non-null float64\n", " 9 color_intensity 178 non-null float64\n", " 10 hue 178 non-null float64\n", " 11 od280/od315_of_diluted_wines 178 non-null float64\n", " 12 proline 178 non-null float64\n", " 13 target 178 non-null int64 \n", "dtypes: float64(13), int64(1)\n", "memory usage: 19.6 KB\n" ] } ], "source": [ "X[\"target\"]=data[\"target\"].astype(np.int64)\n", "X.info()" ] }, { "cell_type": "code", "execution_count": 51, "id": "50bcdef6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolinetarget
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
\n", "
" ], "text/plain": [ " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", "\n", " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", "0 3.06 0.28 2.29 5.64 1.04 \n", "1 2.76 0.26 1.28 4.38 1.05 \n", "2 3.24 0.30 2.81 5.68 1.03 \n", "3 3.49 0.24 2.18 7.80 0.86 \n", "4 2.69 0.39 1.82 4.32 1.04 \n", "\n", " od280/od315_of_diluted_wines proline target \n", "0 3.92 1065.0 0 \n", "1 3.40 1050.0 0 \n", "2 3.17 1185.0 0 \n", "3 3.45 1480.0 0 \n", "4 2.93 735.0 0 " ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.head()" ] }, { "cell_type": "markdown", "id": "704ea79c", "metadata": {}, "source": [ "Постройте матрицу корреляций для всех полей X. Дайте полученному датафрейму название X_corr." ] }, { "cell_type": "code", "execution_count": 52, "id": "41d5c34c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolinetarget
alcohol1.0000000.0943970.211545-0.3102350.2707980.2891010.236815-0.1559290.1366980.546364-0.0717470.0723430.643720-0.328222
malic_acid0.0943971.0000000.1640450.288500-0.054575-0.335167-0.4110070.292977-0.2207460.248985-0.561296-0.368710-0.1920110.437776
ash0.2115450.1640451.0000000.4433670.2865870.1289800.1150770.1862300.0096520.258887-0.0746670.0039110.223626-0.049643
alcalinity_of_ash-0.3102350.2885000.4433671.000000-0.083333-0.321113-0.3513700.361922-0.1973270.018732-0.273955-0.276769-0.4405970.517859
magnesium0.270798-0.0545750.286587-0.0833331.0000000.2144010.195784-0.2562940.2364410.1999500.0553980.0660040.393351-0.209179
total_phenols0.289101-0.3351670.128980-0.3211130.2144011.0000000.864564-0.4499350.612413-0.0551360.4336810.6999490.498115-0.719163
flavanoids0.236815-0.4110070.115077-0.3513700.1957840.8645641.000000-0.5379000.652692-0.1723790.5434790.7871940.494193-0.847498
nonflavanoid_phenols-0.1559290.2929770.1862300.361922-0.256294-0.449935-0.5379001.000000-0.3658450.139057-0.262640-0.503270-0.3113850.489109
proanthocyanins0.136698-0.2207460.009652-0.1973270.2364410.6124130.652692-0.3658451.000000-0.0252500.2955440.5190670.330417-0.499130
color_intensity0.5463640.2489850.2588870.0187320.199950-0.055136-0.1723790.139057-0.0252501.000000-0.521813-0.4288150.3161000.265668
hue-0.071747-0.561296-0.074667-0.2739550.0553980.4336810.543479-0.2626400.295544-0.5218131.0000000.5654680.236183-0.617369
od280/od315_of_diluted_wines0.072343-0.3687100.003911-0.2767690.0660040.6999490.787194-0.5032700.519067-0.4288150.5654681.0000000.312761-0.788230
proline0.643720-0.1920110.223626-0.4405970.3933510.4981150.494193-0.3113850.3304170.3161000.2361830.3127611.000000-0.633717
target-0.3282220.437776-0.0496430.517859-0.209179-0.719163-0.8474980.489109-0.4991300.265668-0.617369-0.788230-0.6337171.000000
\n", "
" ], "text/plain": [ " alcohol malic_acid ash \\\n", "alcohol 1.000000 0.094397 0.211545 \n", "malic_acid 0.094397 1.000000 0.164045 \n", "ash 0.211545 0.164045 1.000000 \n", "alcalinity_of_ash -0.310235 0.288500 0.443367 \n", "magnesium 0.270798 -0.054575 0.286587 \n", "total_phenols 0.289101 -0.335167 0.128980 \n", "flavanoids 0.236815 -0.411007 0.115077 \n", "nonflavanoid_phenols -0.155929 0.292977 0.186230 \n", "proanthocyanins 0.136698 -0.220746 0.009652 \n", "color_intensity 0.546364 0.248985 0.258887 \n", "hue -0.071747 -0.561296 -0.074667 \n", "od280/od315_of_diluted_wines 0.072343 -0.368710 0.003911 \n", "proline 0.643720 -0.192011 0.223626 \n", "target -0.328222 0.437776 -0.049643 \n", "\n", " alcalinity_of_ash magnesium total_phenols \\\n", "alcohol -0.310235 0.270798 0.289101 \n", "malic_acid 0.288500 -0.054575 -0.335167 \n", "ash 0.443367 0.286587 0.128980 \n", "alcalinity_of_ash 1.000000 -0.083333 -0.321113 \n", "magnesium -0.083333 1.000000 0.214401 \n", "total_phenols -0.321113 0.214401 1.000000 \n", "flavanoids -0.351370 0.195784 0.864564 \n", "nonflavanoid_phenols 0.361922 -0.256294 -0.449935 \n", "proanthocyanins -0.197327 0.236441 0.612413 \n", "color_intensity 0.018732 0.199950 -0.055136 \n", "hue -0.273955 0.055398 0.433681 \n", "od280/od315_of_diluted_wines -0.276769 0.066004 0.699949 \n", "proline -0.440597 0.393351 0.498115 \n", "target 0.517859 -0.209179 -0.719163 \n", "\n", " flavanoids nonflavanoid_phenols \\\n", "alcohol 0.236815 -0.155929 \n", "malic_acid -0.411007 0.292977 \n", "ash 0.115077 0.186230 \n", "alcalinity_of_ash -0.351370 0.361922 \n", "magnesium 0.195784 -0.256294 \n", "total_phenols 0.864564 -0.449935 \n", "flavanoids 1.000000 -0.537900 \n", "nonflavanoid_phenols -0.537900 1.000000 \n", "proanthocyanins 0.652692 -0.365845 \n", "color_intensity -0.172379 0.139057 \n", "hue 0.543479 -0.262640 \n", "od280/od315_of_diluted_wines 0.787194 -0.503270 \n", "proline 0.494193 -0.311385 \n", "target -0.847498 0.489109 \n", "\n", " 
proanthocyanins color_intensity hue \\\n", "alcohol 0.136698 0.546364 -0.071747 \n", "malic_acid -0.220746 0.248985 -0.561296 \n", "ash 0.009652 0.258887 -0.074667 \n", "alcalinity_of_ash -0.197327 0.018732 -0.273955 \n", "magnesium 0.236441 0.199950 0.055398 \n", "total_phenols 0.612413 -0.055136 0.433681 \n", "flavanoids 0.652692 -0.172379 0.543479 \n", "nonflavanoid_phenols -0.365845 0.139057 -0.262640 \n", "proanthocyanins 1.000000 -0.025250 0.295544 \n", "color_intensity -0.025250 1.000000 -0.521813 \n", "hue 0.295544 -0.521813 1.000000 \n", "od280/od315_of_diluted_wines 0.519067 -0.428815 0.565468 \n", "proline 0.330417 0.316100 0.236183 \n", "target -0.499130 0.265668 -0.617369 \n", "\n", " od280/od315_of_diluted_wines proline target \n", "alcohol 0.072343 0.643720 -0.328222 \n", "malic_acid -0.368710 -0.192011 0.437776 \n", "ash 0.003911 0.223626 -0.049643 \n", "alcalinity_of_ash -0.276769 -0.440597 0.517859 \n", "magnesium 0.066004 0.393351 -0.209179 \n", "total_phenols 0.699949 0.498115 -0.719163 \n", "flavanoids 0.787194 0.494193 -0.847498 \n", "nonflavanoid_phenols -0.503270 -0.311385 0.489109 \n", "proanthocyanins 0.519067 0.330417 -0.499130 \n", "color_intensity -0.428815 0.316100 0.265668 \n", "hue 0.565468 0.236183 -0.617369 \n", "od280/od315_of_diluted_wines 1.000000 0.312761 -0.788230 \n", "proline 0.312761 1.000000 -0.633717 \n", "target -0.788230 -0.633717 1.000000 " ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_corr = X.corr()\n", "X_corr" ] }, { "cell_type": "markdown", "id": "1ed7c122", "metadata": {}, "source": [ "Создайте список high_corr из признаков, корреляция которых с полем target по абсолютному значению превышает 0.5 (причем, само поле target не должно входить в этот список)." 
] }, { "cell_type": "code", "execution_count": 53, "id": "6edf6763", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['alcalinity_of_ash',\n", " 'total_phenols',\n", " 'flavanoids',\n", " 'hue',\n", " 'od280/od315_of_diluted_wines',\n", " 'proline']" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Correlation of each feature with the target; the target's own entry is dropped.\n", "target_corr = X_corr['target'].drop('target')\n", "# Keep a plain Python list (as the task asks) of the features with |correlation| > 0.5.\n", "high_corr = target_corr[target_corr.abs() > 0.5].index.tolist()\n", "high_corr" ] }, { "cell_type": "markdown", "id": "f0ff52e6", "metadata": {}, "source": [ "Удалите из датафрейма X поле с целевой переменной. Для всех признаков, названия которых содержатся в списке high_corr, вычислите квадрат их значений и добавьте в датафрейм X соответствующие поля с суффиксом '_2', добавленного к первоначальному названию признака. Итоговый датафрейм должен содержать все поля, которые, были в нем изначально, а также поля с признаками из списка high_corr, возведенными в квадрат. Выведите описание полей датафрейма X с помощью метода describe." ] }, { "cell_type": "code", "execution_count": 54, "id": "1e1403ec", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesproline
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.0
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.0
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.0
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.0
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.0
\n", "
" ], "text/plain": [ " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", "\n", " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", "0 3.06 0.28 2.29 5.64 1.04 \n", "1 2.76 0.26 1.28 4.38 1.05 \n", "2 3.24 0.30 2.81 5.68 1.03 \n", "3 3.49 0.24 2.18 7.80 0.86 \n", "4 2.69 0.39 1.82 4.32 1.04 \n", "\n", " od280/od315_of_diluted_wines proline \n", "0 3.92 1065.0 \n", "1 3.40 1050.0 \n", "2 3.17 1185.0 \n", "3 3.45 1480.0 \n", "4 2.93 735.0 " ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = X.drop('target', axis=1)\n", "X.head()" ] }, { "cell_type": "code", "execution_count": 56, "id": "74173e8d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolinealcalinity_of_ash_2total_phenols_2flavanoids_2hue_2od280/od315_of_diluted_wines_2proline_2
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.0243.367.84009.36361.081615.36641134225.0
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.0125.447.02257.61761.102511.56001102500.0
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.0345.967.840010.49761.060910.04891404225.0
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.0282.2414.822512.18010.739611.90252190400.0
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.0441.007.84007.23611.08168.5849540225.0
\n", "
" ], "text/plain": [ " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", "\n", " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", "0 3.06 0.28 2.29 5.64 1.04 \n", "1 2.76 0.26 1.28 4.38 1.05 \n", "2 3.24 0.30 2.81 5.68 1.03 \n", "3 3.49 0.24 2.18 7.80 0.86 \n", "4 2.69 0.39 1.82 4.32 1.04 \n", "\n", " od280/od315_of_diluted_wines proline alcalinity_of_ash_2 \\\n", "0 3.92 1065.0 243.36 \n", "1 3.40 1050.0 125.44 \n", "2 3.17 1185.0 345.96 \n", "3 3.45 1480.0 282.24 \n", "4 2.93 735.0 441.00 \n", "\n", " total_phenols_2 flavanoids_2 hue_2 od280/od315_of_diluted_wines_2 \\\n", "0 7.8400 9.3636 1.0816 15.3664 \n", "1 7.0225 7.6176 1.1025 11.5600 \n", "2 7.8400 10.4976 1.0609 10.0489 \n", "3 14.8225 12.1801 0.7396 11.9025 \n", "4 7.8400 7.2361 1.0816 8.5849 \n", "\n", " proline_2 \n", "0 1134225.0 \n", "1 1102500.0 \n", "2 1404225.0 \n", "3 2190400.0 \n", "4 540225.0 " ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for feature_name in high_corr:\n", " X['{0}_2'.format(feature_name)] = X[feature_name] ** 2\n", "\n", "X.head()" ] }, { "cell_type": "code", "execution_count": 57, "id": "190f74c0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolinealcalinity_of_ash_2total_phenols_2flavanoids_2hue_2od280/od315_of_diluted_wines_2proline_2
count178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.0000001.780000e+02
mean13.0006182.3363482.36651719.49494499.7415732.2951122.0292700.3618541.5908995.0580900.9574492.611685746.893258391.1428655.6570305.1100490.9686617.3221556.564591e+05
std0.8118271.1171460.2743443.33956414.2824840.6258510.9988590.1244530.5723592.3182860.2285720.709990314.907474133.6717752.9362944.2114410.4437983.5843165.558591e+05
min11.0300000.7400001.36000010.60000070.0000000.9800000.3400000.1300000.4100001.2800000.4800001.270000278.000000112.3600000.9604000.1156000.2304001.6129007.728400e+04
25%12.3625001.6025002.21000017.20000088.0000001.7425001.2050000.2700001.2500003.2200000.7825001.937500500.500000295.8400003.0363251.4521000.6123253.7540752.505010e+05
50%13.0500001.8650002.36000019.50000098.0000002.3550002.1350000.3400001.5550004.6900000.9650002.780000673.500000380.2500005.5460504.5582500.9312507.7284004.536045e+05
75%13.6775003.0825002.55750021.500000107.0000002.8000002.8750000.4375001.9500006.2000001.1200003.170000985.000000462.2500007.8400008.2657001.25440010.0489009.702250e+05
max14.8300005.8000003.23000030.000000162.0000003.8800005.0800000.6600003.58000013.0000001.7100004.0000001680.000000900.00000015.05440025.8064002.92410016.0000002.822400e+06
\n", "
" ], "text/plain": [ " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", "count 178.000000 178.000000 178.000000 178.000000 178.000000 \n", "mean 13.000618 2.336348 2.366517 19.494944 99.741573 \n", "std 0.811827 1.117146 0.274344 3.339564 14.282484 \n", "min 11.030000 0.740000 1.360000 10.600000 70.000000 \n", "25% 12.362500 1.602500 2.210000 17.200000 88.000000 \n", "50% 13.050000 1.865000 2.360000 19.500000 98.000000 \n", "75% 13.677500 3.082500 2.557500 21.500000 107.000000 \n", "max 14.830000 5.800000 3.230000 30.000000 162.000000 \n", "\n", " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", "count 178.000000 178.000000 178.000000 178.000000 \n", "mean 2.295112 2.029270 0.361854 1.590899 \n", "std 0.625851 0.998859 0.124453 0.572359 \n", "min 0.980000 0.340000 0.130000 0.410000 \n", "25% 1.742500 1.205000 0.270000 1.250000 \n", "50% 2.355000 2.135000 0.340000 1.555000 \n", "75% 2.800000 2.875000 0.437500 1.950000 \n", "max 3.880000 5.080000 0.660000 3.580000 \n", "\n", " color_intensity hue od280/od315_of_diluted_wines proline \\\n", "count 178.000000 178.000000 178.000000 178.000000 \n", "mean 5.058090 0.957449 2.611685 746.893258 \n", "std 2.318286 0.228572 0.709990 314.907474 \n", "min 1.280000 0.480000 1.270000 278.000000 \n", "25% 3.220000 0.782500 1.937500 500.500000 \n", "50% 4.690000 0.965000 2.780000 673.500000 \n", "75% 6.200000 1.120000 3.170000 985.000000 \n", "max 13.000000 1.710000 4.000000 1680.000000 \n", "\n", " alcalinity_of_ash_2 total_phenols_2 flavanoids_2 hue_2 \\\n", "count 178.000000 178.000000 178.000000 178.000000 \n", "mean 391.142865 5.657030 5.110049 0.968661 \n", "std 133.671775 2.936294 4.211441 0.443798 \n", "min 112.360000 0.960400 0.115600 0.230400 \n", "25% 295.840000 3.036325 1.452100 0.612325 \n", "50% 380.250000 5.546050 4.558250 0.931250 \n", "75% 462.250000 7.840000 8.265700 1.254400 \n", "max 900.000000 15.054400 25.806400 2.924100 \n", "\n", " od280/od315_of_diluted_wines_2 proline_2 \n", 
"count 178.000000 1.780000e+02 \n", "mean 7.322155 6.564591e+05 \n", "std 3.584316 5.558591e+05 \n", "min 1.612900 7.728400e+04 \n", "25% 3.754075 2.505010e+05 \n", "50% 7.728400 4.536045e+05 \n", "75% 10.048900 9.702250e+05 \n", "max 16.000000 2.822400e+06 " ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.describe()" ] }, { "cell_type": "code", "execution_count": null, "id": "97f44af7", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }