{
"cells": [
{
"cell_type": "markdown",
"id": "626646d0",
"metadata": {},
"source": [
"# Курсовой проект для курса \"Python для Data Science\"\n",
"\n",
"## Info\n",
"\n",
"Задание:\n",
"Используя данные из train.csv, построить\n",
"модель для предсказания цен на недвижимость (квартиры).\n",
"С помощью полученной модели предсказать\n",
"цены для квартир из файла test.csv.\n",
"\n",
"Целевая переменная:\n",
"Price\n",
"\n",
"Метрика:\n",
"R2 - коэффициент детерминации (sklearn.metrics.r2_score)\n",
"\n",
"Сдача проекта:\n",
"1. Прислать в раздел Задания Урока 10 (\"Вебинар. Консультация по итоговому проекту\")\n",
"ссылку на программу в github (программа должна содержаться в файле Jupyter Notebook \n",
"с расширением ipynb). (Pull request не нужен, только ссылка ведущая на сам скрипт).\n",
"2. Приложить файл с названием по образцу SShirkin_predictions.csv\n",
"с предсказанными ценами для квартир из test.csv (файл должен содержать два поля: Id, Price).\n",
"В файле с предсказаниями должна быть 5001 строка (шапка + 5000 предсказаний)."
]
},
{
"cell_type": "code",
"execution_count": 126,
"id": "497b5df9",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.manifold import TSNE\n",
"\n",
"from sklearn.metrics import r2_score\n",
"from sklearn.metrics import mean_squared_error\n",
"\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.model_selection import train_test_split, cross_val_score\n",
"from sklearn.model_selection import KFold, GridSearchCV\n",
"\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.ensemble import GradientBoostingRegressor\n",
"from sklearn.linear_model import LassoCV\n",
"from xgboost import XGBRegressor\n",
"from lightgbm import LGBMRegressor\n",
"\n",
"from scipy.stats import norm\n",
"from scipy import stats\n",
"\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format = 'svg'\n",
"plt.rcParams['font.size'] = '8'\n",
"\n",
"from datetime import datetime\n",
"import warnings"
]
},
{
"cell_type": "markdown",
"id": "f6228421",
"metadata": {},
"source": [
"дополнительные функции"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f41d1f5c",
"metadata": {},
"outputs": [],
"source": [
"def optimizing_df(df):\n",
"    \"\"\"Shrink the memory footprint of ``df`` by downcasting its columns.\n",
"\n",
"    Integer columns are downcast to the smallest unsigned type when their\n",
"    minimum is non-negative, otherwise to the smallest signed type. Float and\n",
"    complex columns are passed through ``pd.to_numeric(downcast='float')``.\n",
"    Object columns whose share of distinct values is below 0.5 are converted\n",
"    to ``category``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Frame to optimise. It is modified in place.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        The same ``df`` object, so the call can be used in an assignment.\n",
"    \"\"\"\n",
"    num_total_values = len(df)\n",
"\n",
"    for col in df.columns:\n",
"        kind = df[col].dtype.kind\n",
"\n",
"        if kind in ('i', 'u'):\n",
"            # Unsigned types give a wider positive range for the same width.\n",
"            if df[col].min() >= 0:\n",
"                df[col] = pd.to_numeric(df[col], downcast='unsigned')\n",
"            else:\n",
"                df[col] = pd.to_numeric(df[col], downcast='integer')\n",
"\n",
"        elif kind in ('f', 'c'):\n",
"            df[col] = pd.to_numeric(df[col], downcast='float')\n",
"\n",
"        elif kind == 'O':\n",
"            num_unique_values = len(df[col].unique())\n",
"            # An empty frame has no ratio to compute; skipping it avoids a\n",
"            # ZeroDivisionError and leaves the column untouched.\n",
"            if num_total_values and num_unique_values / num_total_values < 0.5:\n",
"                df[col] = df[col].astype('category')\n",
"\n",
"    return df"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "92c4d645",
"metadata": {},
"outputs": [],
"source": [
"def model_test(model, name, test, valid):\n",
"    \"\"\"Plot the residuals of ``model`` on a hold-out set.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    model : object\n",
"        Fitted estimator exposing ``predict``.\n",
"    name : str\n",
"        Title of the figure.\n",
"    test : array-like\n",
"        Features passed to ``model.predict``.\n",
"    valid : array-like\n",
"        True target values for the rows of ``test``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    None\n",
"        The figure is shown with ``plt.show()``; R2 and MSE appear in the legend.\n",
"    \"\"\"\n",
"    model_pred = model.predict(test)\n",
"    r2 = r2_score(valid, model_pred)\n",
"    mse = mean_squared_error(valid, model_pred)\n",
"\n",
"    # x carries the true values and y the residuals, so label them as such.\n",
"    residuals = model_pred - valid\n",
"    plt.scatter(valid, residuals)\n",
"    plt.xlabel('Real values')\n",
"    plt.ylabel('Residuals (predicted - real)')\n",
"    plt.title(name)\n",
"    plt.legend([f'R2= {r2:.4f} and mse= {mse:.0e}'])\n",
"    # Zero line: points above it are over-predictions, below it under-predictions.\n",
"    plt.axhline(0, color='red')\n",
"    plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 129,
"id": "ef22d95a",
"metadata": {},
"outputs": [],
"source": [
"def models_r2(models, test, valid):\n",
"    \"\"\"Score several fitted models on the same hold-out set.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    models : dict\n",
"        Mapping of display name to a fitted estimator exposing ``predict``.\n",
"    test : array-like\n",
"        Features passed to each ``model.predict``.\n",
"    valid : array-like\n",
"        True target values for the rows of ``test``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        Columns ``name``, ``r2`` and ``mse``, sorted by ``r2`` in descending order.\n",
"    \"\"\"\n",
"    # Collect plain records and build the frame once: DataFrame.append was\n",
"    # removed in pandas 2.0 and grew the frame quadratically.\n",
"    records = []\n",
"    for name, model in models.items():\n",
"        test_pred = model.predict(test)\n",
"        records.append({\n",
"            'name': name,\n",
"            'r2': r2_score(valid, test_pred),\n",
"            'mse': mean_squared_error(valid, test_pred),\n",
"        })\n",
"\n",
"    scores = pd.DataFrame(records, columns=['name', 'r2', 'mse'])\n",
"    return scores.sort_values('r2', ascending=False)"
]
},
{
"cell_type": "markdown",
"id": "2efb54f9",
"metadata": {},
"source": [
"загрузка данных"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "bee6ed0c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train:\t10000\tстрок\t 20\t столбцов\n",
"Test:\t5000\tстрок\t 19\t столбцов\n"
]
}
],
"source": [
"# Read the train and test sets from the local data_set directory.\n",
"train = pd.read_csv('data_set/train.csv')\n",
"test = pd.read_csv('data_set/test.csv')\n",
"\n",
"# Report the number of rows and columns of each set.\n",
"print(f\"Train:\\t{train.shape[0]}\\tстрок\\t {train.shape[1]}\\t столбцов\")\n",
"print(f\"Test:\\t{test.shape[0]}\\tстрок\\t {test.shape[1]}\\t столбцов\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "30985213",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Id | \n",
" DistrictId | \n",
" Rooms | \n",
" Square | \n",
" LifeSquare | \n",
" KitchenSquare | \n",
" Floor | \n",
" HouseFloor | \n",
" HouseYear | \n",
" Ecology_1 | \n",
" Ecology_2 | \n",
" Ecology_3 | \n",
" Social_1 | \n",
" Social_2 | \n",
" Social_3 | \n",
" Healthcare_1 | \n",
" Helthcare_2 | \n",
" Shops_1 | \n",
" Shops_2 | \n",
" Price | \n",
"
\n",
" \n",
" \n",
" \n",
" | 9997 | \n",
" 5123 | \n",
" 27 | \n",
" 1.0 | \n",
" 47.939008 | \n",
" NaN | \n",
" 1.0 | \n",
" 12 | \n",
" 16.0 | \n",
" 2015 | \n",
" 0.072158 | \n",
" B | \n",
" B | \n",
" 2 | \n",
" 629 | \n",
" 1 | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
" A | \n",
" 159143.805370 | \n",
"
\n",
" \n",
" | 9998 | \n",
" 5400 | \n",
" 75 | \n",
" 2.0 | \n",
" 43.602562 | \n",
" 33.840147 | \n",
" 8.0 | \n",
" 1 | \n",
" 5.0 | \n",
" 1961 | \n",
" 0.307467 | \n",
" B | \n",
" A | \n",
" 30 | \n",
" 5048 | \n",
" 9 | \n",
" 325.0 | \n",
" 2 | \n",
" 5 | \n",
" B | \n",
" 181595.339808 | \n",
"
\n",
" \n",
" | 9999 | \n",
" 6306 | \n",
" 128 | \n",
" 1.0 | \n",
" 38.666645 | \n",
" 21.157874 | \n",
" 8.0 | \n",
" 7 | \n",
" 17.0 | \n",
" 1990 | \n",
" 0.000000 | \n",
" B | \n",
" B | \n",
" 27 | \n",
" 4798 | \n",
" 0 | \n",
" 30.0 | \n",
" 2 | \n",
" 8 | \n",
" B | \n",
" 218714.077615 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Id DistrictId Rooms Square LifeSquare KitchenSquare Floor \\\n",
"9997 5123 27 1.0 47.939008 NaN 1.0 12 \n",
"9998 5400 75 2.0 43.602562 33.840147 8.0 1 \n",
"9999 6306 128 1.0 38.666645 21.157874 8.0 7 \n",
"\n",
" HouseFloor HouseYear Ecology_1 Ecology_2 Ecology_3 Social_1 \\\n",
"9997 16.0 2015 0.072158 B B 2 \n",
"9998 5.0 1961 0.307467 B A 30 \n",
"9999 17.0 1990 0.000000 B B 27 \n",
"\n",
" Social_2 Social_3 Healthcare_1 Helthcare_2 Shops_1 Shops_2 \\\n",
"9997 629 1 NaN 0 0 A \n",
"9998 5048 9 325.0 2 5 B \n",
"9999 4798 0 30.0 2 8 B \n",
"\n",
" Price \n",
"9997 159143.805370 \n",
"9998 181595.339808 \n",
"9999 218714.077615 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the last three rows of the training set.\n",
"train.tail(n=3)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b8311efd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Id | \n",
" DistrictId | \n",
" Rooms | \n",
" Square | \n",
" LifeSquare | \n",
" KitchenSquare | \n",
" Floor | \n",
" HouseFloor | \n",
" HouseYear | \n",
" Ecology_1 | \n",
" Ecology_2 | \n",
" Ecology_3 | \n",
" Social_1 | \n",
" Social_2 | \n",
" Social_3 | \n",
" Healthcare_1 | \n",
" Helthcare_2 | \n",
" Shops_1 | \n",
" Shops_2 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 4997 | \n",
" 5783 | \n",
" 12 | \n",
" 3.0 | \n",
" 77.842178 | \n",
" 48.282625 | \n",
" 9.0 | \n",
" 23 | \n",
" 22.0 | \n",
" 1989 | \n",
" 0.090799 | \n",
" B | \n",
" B | \n",
" 74 | \n",
" 19083 | \n",
" 2 | \n",
" NaN | \n",
" 5 | \n",
" 15 | \n",
" B | \n",
"
\n",
" \n",
" | 4998 | \n",
" 4780 | \n",
" 62 | \n",
" 2.0 | \n",
" 81.305222 | \n",
" NaN | \n",
" 0.0 | \n",
" 4 | \n",
" 0.0 | \n",
" 1977 | \n",
" 0.072158 | \n",
" B | \n",
" B | \n",
" 2 | \n",
" 629 | \n",
" 1 | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
" A | \n",
"
\n",
" \n",
" | 4999 | \n",
" 12504 | \n",
" 30 | \n",
" 2.0 | \n",
" 60.555693 | \n",
" NaN | \n",
" 1.0 | \n",
" 10 | \n",
" 17.0 | \n",
" 1977 | \n",
" 0.000078 | \n",
" B | \n",
" B | \n",
" 22 | \n",
" 6398 | \n",
" 141 | \n",
" 1046.0 | \n",
" 3 | \n",
" 23 | \n",
" B | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Id DistrictId Rooms Square LifeSquare KitchenSquare Floor \\\n",
"4997 5783 12 3.0 77.842178 48.282625 9.0 23 \n",
"4998 4780 62 2.0 81.305222 NaN 0.0 4 \n",
"4999 12504 30 2.0 60.555693 NaN 1.0 10 \n",
"\n",
" HouseFloor HouseYear Ecology_1 Ecology_2 Ecology_3 Social_1 \\\n",
"4997 22.0 1989 0.090799 B B 74 \n",
"4998 0.0 1977 0.072158 B B 2 \n",
"4999 17.0 1977 0.000078 B B 22 \n",
"\n",
" Social_2 Social_3 Healthcare_1 Helthcare_2 Shops_1 Shops_2 \n",
"4997 19083 2 NaN 5 15 B \n",
"4998 629 1 NaN 0 0 A \n",
"4999 6398 141 1046.0 3 23 B "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the last three rows of the test set.\n",
"test.tail(n=3)"
]
},
{
"cell_type": "markdown",
"id": "04eedfdc",
"metadata": {},
"source": [
"Приведение типов данных"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "4ea9db07",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 10000 entries, 0 to 9999\n",
"Data columns (total 20 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Id 10000 non-null int64 \n",
" 1 DistrictId 10000 non-null int64 \n",
" 2 Rooms 10000 non-null float64\n",
" 3 Square 10000 non-null float64\n",
" 4 LifeSquare 7887 non-null float64\n",
" 5 KitchenSquare 10000 non-null float64\n",
" 6 Floor 10000 non-null int64 \n",
" 7 HouseFloor 10000 non-null float64\n",
" 8 HouseYear 10000 non-null int64 \n",
" 9 Ecology_1 10000 non-null float64\n",
" 10 Ecology_2 10000 non-null object \n",
" 11 Ecology_3 10000 non-null object \n",
" 12 Social_1 10000 non-null int64 \n",
" 13 Social_2 10000 non-null int64 \n",
" 14 Social_3 10000 non-null int64 \n",
" 15 Healthcare_1 5202 non-null float64\n",
" 16 Helthcare_2 10000 non-null int64 \n",
" 17 Shops_1 10000 non-null int64 \n",
" 18 Shops_2 10000 non-null object \n",
" 19 Price 10000 non-null float64\n",
"dtypes: float64(8), int64(9), object(3)\n",
"memory usage: 3.0 MB\n"
]
}
],
"source": [
"# Column dtypes, non-null counts and deep memory usage of the training set.\n",
"train.info(memory_usage='deep')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "648a893d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 5000 entries, 0 to 4999\n",
"Data columns (total 19 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Id 5000 non-null int64 \n",
" 1 DistrictId 5000 non-null int64 \n",
" 2 Rooms 5000 non-null float64\n",
" 3 Square 5000 non-null float64\n",
" 4 LifeSquare 3959 non-null float64\n",
" 5 KitchenSquare 5000 non-null float64\n",
" 6 Floor 5000 non-null int64 \n",
" 7 HouseFloor 5000 non-null float64\n",
" 8 HouseYear 5000 non-null int64 \n",
" 9 Ecology_1 5000 non-null float64\n",
" 10 Ecology_2 5000 non-null object \n",
" 11 Ecology_3 5000 non-null object \n",
" 12 Social_1 5000 non-null int64 \n",
" 13 Social_2 5000 non-null int64 \n",
" 14 Social_3 5000 non-null int64 \n",
" 15 Healthcare_1 2623 non-null float64\n",
" 16 Helthcare_2 5000 non-null int64 \n",
" 17 Shops_1 5000 non-null int64 \n",
" 18 Shops_2 5000 non-null object \n",
"dtypes: float64(7), int64(9), object(3)\n",
"memory usage: 1.4 MB\n"
]
}
],
"source": [
"# Column dtypes, non-null counts and deep memory usage of the test set.\n",
"test.info(memory_usage='deep')"
]
},
{
"cell_type": "markdown",
"id": "9ca3008f",
"metadata": {},
"source": [
"изменим некоторые типы данных"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "a7e1d2f7",
"metadata": {},
"outputs": [],
"source": [
"# Rooms and HouseFloor are loaded as floats; cast them to integers in both sets.\n",
"for frame in (train, test):\n",
"    for column in ('Rooms', 'HouseFloor'):\n",
"        frame[column] = frame[column].astype('int64')\n",
"\n",
"# Then downcast every column to the smallest suitable dtype.\n",
"train, test = optimizing_df(train), optimizing_df(test)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "9f1f81d9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 10000 entries, 0 to 9999\n",
"Data columns (total 20 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Id 10000 non-null uint16 \n",
" 1 DistrictId 10000 non-null uint8 \n",
" 2 Rooms 10000 non-null uint8 \n",
" 3 Square 10000 non-null float32 \n",
" 4 LifeSquare 7887 non-null float32 \n",
" 5 KitchenSquare 10000 non-null float32 \n",
" 6 Floor 10000 non-null uint8 \n",
" 7 HouseFloor 10000 non-null uint8 \n",
" 8 HouseYear 10000 non-null uint32 \n",
" 9 Ecology_1 10000 non-null float32 \n",
" 10 Ecology_2 10000 non-null category\n",
" 11 Ecology_3 10000 non-null category\n",
" 12 Social_1 10000 non-null uint8 \n",
" 13 Social_2 10000 non-null uint16 \n",
" 14 Social_3 10000 non-null uint8 \n",
" 15 Healthcare_1 5202 non-null float32 \n",
" 16 Helthcare_2 10000 non-null uint8 \n",
" 17 Shops_1 10000 non-null uint8 \n",
" 18 Shops_2 10000 non-null category\n",
" 19 Price 10000 non-null float32 \n",
"dtypes: category(3), float32(6), uint16(2), uint32(1), uint8(8)\n",
"memory usage: 420.7 KB\n"
]
}
],
"source": [
"# Same report for the training set, to compare dtypes and memory usage.\n",
"train.info(memory_usage='deep')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "d4431418",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 5000 entries, 0 to 4999\n",
"Data columns (total 19 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Id 5000 non-null uint16 \n",
" 1 DistrictId 5000 non-null uint8 \n",
" 2 Rooms 5000 non-null uint8 \n",
" 3 Square 5000 non-null float32 \n",
" 4 LifeSquare 3959 non-null float32 \n",
" 5 KitchenSquare 5000 non-null float32 \n",
" 6 Floor 5000 non-null uint8 \n",
" 7 HouseFloor 5000 non-null uint8 \n",
" 8 HouseYear 5000 non-null uint16 \n",
" 9 Ecology_1 5000 non-null float32 \n",
" 10 Ecology_2 5000 non-null category\n",
" 11 Ecology_3 5000 non-null category\n",
" 12 Social_1 5000 non-null uint8 \n",
" 13 Social_2 5000 non-null uint16 \n",
" 14 Social_3 5000 non-null uint8 \n",
" 15 Healthcare_1 2623 non-null float32 \n",
" 16 Helthcare_2 5000 non-null uint8 \n",
" 17 Shops_1 5000 non-null uint8 \n",
" 18 Shops_2 5000 non-null category\n",
"dtypes: category(3), float32(5), uint16(3), uint8(8)\n",
"memory usage: 181.4 KB\n"
]
}
],
"source": [
"# Same report for the test set, to compare dtypes and memory usage.\n",
"test.info(memory_usage='deep')"
]
},
{
"cell_type": "markdown",
"id": "ffe2f0bc",
"metadata": {},
"source": [
"DistrictId и Id нужно отнести к категориальным признакам."
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "5fc60692",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 10000 entries, 0 to 9999\n",
"Data columns (total 20 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Id 10000 non-null object \n",
" 1 DistrictId 10000 non-null object \n",
" 2 Rooms 10000 non-null uint8 \n",
" 3 Square 10000 non-null float32 \n",
" 4 LifeSquare 7887 non-null float32 \n",
" 5 KitchenSquare 10000 non-null float32 \n",
" 6 Floor 10000 non-null uint8 \n",
" 7 HouseFloor 10000 non-null uint8 \n",
" 8 HouseYear 10000 non-null uint32 \n",
" 9 Ecology_1 10000 non-null float32 \n",
" 10 Ecology_2 10000 non-null category\n",
" 11 Ecology_3 10000 non-null category\n",
" 12 Social_1 10000 non-null uint8 \n",
" 13 Social_2 10000 non-null uint16 \n",
" 14 Social_3 10000 non-null uint8 \n",
" 15 Healthcare_1 5202 non-null float32 \n",
" 16 Helthcare_2 10000 non-null uint8 \n",
" 17 Shops_1 10000 non-null uint8 \n",
" 18 Shops_2 10000 non-null category\n",
" 19 Price 10000 non-null float32 \n",
"dtypes: category(3), float32(6), object(2), uint16(1), uint32(1), uint8(7)\n",
"memory usage: 547.4+ KB\n"
]
}
],
"source": [
"# Store the identifier columns of the training set with the generic object dtype.\n",
"# NOTE(review): only `train` is converted in this cell; `test` keeps its integer\n",
"# dtypes for DistrictId and Id - confirm that this asymmetry is intended.\n",
"train.DistrictId=train.DistrictId.astype('object')\n",
"train.Id=train.Id.astype('object')\n",
"train.info()"
]
},
{
"cell_type": "markdown",
"id": "5991c5d7",
"metadata": {},
"source": [
"Посмотрим на распределение данных"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "bdd32427",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Histograms (30 bins) for the training columns that are not of object dtype.\n",
"digital_features=train.select_dtypes(exclude=['object'])\n",
"digital_features.hist(figsize=(12,10), bins=30)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "6141fa4e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
" mean | \n",
" std | \n",
" min | \n",
" 25% | \n",
" 50% | \n",
" 75% | \n",
" max | \n",
"
\n",
" \n",
" \n",
" \n",
" | Rooms | \n",
" 10000.0 | \n",
" 1.890500 | \n",
" 0.839512 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
" 2.000000 | \n",
" 2.000000 | \n",
" 1.900000e+01 | \n",
"
\n",
" \n",
" | Square | \n",
" 10000.0 | \n",
" 56.315712 | \n",
" 21.058718 | \n",
" 1.136859 | \n",
" 41.774879 | \n",
" 52.513309 | \n",
" 65.900627 | \n",
" 6.410652e+02 | \n",
"
\n",
" \n",
" | LifeSquare | \n",
" 7887.0 | \n",
" 37.199596 | \n",
" 86.241112 | \n",
" 0.370619 | \n",
" 22.769833 | \n",
" 32.781261 | \n",
" 45.128803 | \n",
" 7.480592e+03 | \n",
"
\n",
" \n",
" | KitchenSquare | \n",
" 10000.0 | \n",
" 6.273300 | \n",
" 28.561113 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
" 6.000000 | \n",
" 9.000000 | \n",
" 2.014000e+03 | \n",
"
\n",
" \n",
" | Floor | \n",
" 10000.0 | \n",
" 8.526700 | \n",
" 5.241148 | \n",
" 1.000000 | \n",
" 4.000000 | \n",
" 7.000000 | \n",
" 12.000000 | \n",
" 4.200000e+01 | \n",
"
\n",
" \n",
" | HouseFloor | \n",
" 10000.0 | \n",
" 12.609400 | \n",
" 6.775974 | \n",
" 0.000000 | \n",
" 9.000000 | \n",
" 13.000000 | \n",
" 17.000000 | \n",
" 1.170000e+02 | \n",
"
\n",
" \n",
" | HouseYear | \n",
" 10000.0 | \n",
" 3990.166300 | \n",
" 200500.261427 | \n",
" 1910.000000 | \n",
" 1974.000000 | \n",
" 1977.000000 | \n",
" 2001.000000 | \n",
" 2.005201e+07 | \n",
"
\n",
" \n",
" | Ecology_1 | \n",
" 10000.0 | \n",
" 0.118858 | \n",
" 0.119026 | \n",
" 0.000000 | \n",
" 0.017647 | \n",
" 0.075424 | \n",
" 0.195781 | \n",
" 5.218670e-01 | \n",
"
\n",
" \n",
" | Social_1 | \n",
" 10000.0 | \n",
" 24.687000 | \n",
" 17.532614 | \n",
" 0.000000 | \n",
" 6.000000 | \n",
" 25.000000 | \n",
" 36.000000 | \n",
" 7.400000e+01 | \n",
"
\n",
" \n",
" | Social_2 | \n",
" 10000.0 | \n",
" 5352.157400 | \n",
" 4006.799803 | \n",
" 168.000000 | \n",
" 1564.000000 | \n",
" 5285.000000 | \n",
" 7227.000000 | \n",
" 1.908300e+04 | \n",
"
\n",
" \n",
" | Social_3 | \n",
" 10000.0 | \n",
" 8.039200 | \n",
" 23.831875 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 2.000000 | \n",
" 5.000000 | \n",
" 1.410000e+02 | \n",
"
\n",
" \n",
" | Healthcare_1 | \n",
" 5202.0 | \n",
" 1142.904419 | \n",
" 1021.518982 | \n",
" 0.000000 | \n",
" 350.000000 | \n",
" 900.000000 | \n",
" 1548.000000 | \n",
" 4.849000e+03 | \n",
"
\n",
" \n",
" | Helthcare_2 | \n",
" 10000.0 | \n",
" 1.319500 | \n",
" 1.493601 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
" 2.000000 | \n",
" 6.000000e+00 | \n",
"
\n",
" \n",
" | Shops_1 | \n",
" 10000.0 | \n",
" 4.231300 | \n",
" 4.806341 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
" 3.000000 | \n",
" 6.000000 | \n",
" 2.300000e+01 | \n",
"
\n",
" \n",
" | Price | \n",
" 10000.0 | \n",
" 214139.218750 | \n",
" 92872.304688 | \n",
" 59174.777344 | \n",
" 153872.628906 | \n",
" 192269.648438 | \n",
" 249135.460938 | \n",
" 6.332334e+05 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" count mean std min \\\n",
"Rooms 10000.0 1.890500 0.839512 0.000000 \n",
"Square 10000.0 56.315712 21.058718 1.136859 \n",
"LifeSquare 7887.0 37.199596 86.241112 0.370619 \n",
"KitchenSquare 10000.0 6.273300 28.561113 0.000000 \n",
"Floor 10000.0 8.526700 5.241148 1.000000 \n",
"HouseFloor 10000.0 12.609400 6.775974 0.000000 \n",
"HouseYear 10000.0 3990.166300 200500.261427 1910.000000 \n",
"Ecology_1 10000.0 0.118858 0.119026 0.000000 \n",
"Social_1 10000.0 24.687000 17.532614 0.000000 \n",
"Social_2 10000.0 5352.157400 4006.799803 168.000000 \n",
"Social_3 10000.0 8.039200 23.831875 0.000000 \n",
"Healthcare_1 5202.0 1142.904419 1021.518982 0.000000 \n",
"Helthcare_2 10000.0 1.319500 1.493601 0.000000 \n",
"Shops_1 10000.0 4.231300 4.806341 0.000000 \n",
"Price 10000.0 214139.218750 92872.304688 59174.777344 \n",
"\n",
" 25% 50% 75% max \n",
"Rooms 1.000000 2.000000 2.000000 1.900000e+01 \n",
"Square 41.774879 52.513309 65.900627 6.410652e+02 \n",
"LifeSquare 22.769833 32.781261 45.128803 7.480592e+03 \n",
"KitchenSquare 1.000000 6.000000 9.000000 2.014000e+03 \n",
"Floor 4.000000 7.000000 12.000000 4.200000e+01 \n",
"HouseFloor 9.000000 13.000000 17.000000 1.170000e+02 \n",
"HouseYear 1974.000000 1977.000000 2001.000000 2.005201e+07 \n",
"Ecology_1 0.017647 0.075424 0.195781 5.218670e-01 \n",
"Social_1 6.000000 25.000000 36.000000 7.400000e+01 \n",
"Social_2 1564.000000 5285.000000 7227.000000 1.908300e+04 \n",
"Social_3 0.000000 2.000000 5.000000 1.410000e+02 \n",
"Healthcare_1 350.000000 900.000000 1548.000000 4.849000e+03 \n",
"Helthcare_2 0.000000 1.000000 2.000000 6.000000e+00 \n",
"Shops_1 1.000000 3.000000 6.000000 2.300000e+01 \n",
"Price 153872.628906 192269.648438 249135.460938 6.332334e+05 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics of the training set, one row per feature.\n",
"train.describe().transpose()"
]
},
{
"cell_type": "markdown",
"id": "a856dcd3",
"metadata": {},
"source": [
"### Square, LifeSquare, KitchenSquare"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "3f3a684b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2113\n",
"1041\n"
]
}
],
"source": [
"# Number of missing LifeSquare values in the training set, then in the test set.\n",
"print(train['LifeSquare'].isnull().sum())\n",
"print(test['LifeSquare'].isnull().sum())"
]
},
{
"cell_type": "markdown",
"id": "b0f1aba2",
"metadata": {},
"source": [
"LifeSquare содержит пропуски (NULL) и в тестовых, и в тренировочных данных; заменим их на 0."
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "1d0d7ae7",
"metadata": {},
"outputs": [],
"source": [
"# Replace missing LifeSquare values with 0 in both sets.\n",
"# NOTE(review): 0 then acts as a marker for an unknown living area - confirm\n",
"# that later steps treat it that way.\n",
"train.loc[train['LifeSquare'].isnull(), 'LifeSquare'] = 0\n",
"test.loc[test['LifeSquare'].isnull(), 'LifeSquare'] = 0"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "caab9a71",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
" mean | \n",
" std | \n",
" min | \n",
" 25% | \n",
" 50% | \n",
" 75% | \n",
" max | \n",
"
\n",
" \n",
" \n",
" \n",
" | Id | \n",
" 5000.0 | \n",
" 8412.595400 | \n",
" 4832.674037 | \n",
" 1.000000 | \n",
" 4221.750000 | \n",
" 8320.500000 | \n",
" 12598.250000 | \n",
" 16795.000000 | \n",
"
\n",
" \n",
" | DistrictId | \n",
" 5000.0 | \n",
" 51.279200 | \n",
" 44.179466 | \n",
" 0.000000 | \n",
" 21.000000 | \n",
" 37.000000 | \n",
" 77.000000 | \n",
" 212.000000 | \n",
"
\n",
" \n",
" | Rooms | \n",
" 5000.0 | \n",
" 1.910000 | \n",
" 0.838594 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
" 2.000000 | \n",
" 2.000000 | \n",
" 17.000000 | \n",
"
\n",
" \n",
" | Square | \n",
" 5000.0 | \n",
" 56.449501 | \n",
" 19.092793 | \n",
" 1.378543 | \n",
" 41.906230 | \n",
" 52.921339 | \n",
" 66.285130 | \n",
" 223.453690 | \n",
"
\n",
" \n",
" | LifeSquare | \n",
" 5000.0 | \n",
" 28.630507 | \n",
" 21.613903 | \n",
" 0.000000 | \n",
" 17.873732 | \n",
" 29.118838 | \n",
" 41.769526 | \n",
" 303.071106 | \n",
"
\n",
" \n",
" | KitchenSquare | \n",
" 5000.0 | \n",
" 5.976800 | \n",
" 9.950147 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
" 6.000000 | \n",
" 9.000000 | \n",
" 620.000000 | \n",
"
\n",
" \n",
" | Floor | \n",
" 5000.0 | \n",
" 8.632000 | \n",
" 5.483228 | \n",
" 1.000000 | \n",
" 4.000000 | \n",
" 7.000000 | \n",
" 12.000000 | \n",
" 78.000000 | \n",
"
\n",
" \n",
" | HouseFloor | \n",
" 5000.0 | \n",
" 12.601000 | \n",
" 6.789213 | \n",
" 0.000000 | \n",
" 9.000000 | \n",
" 12.000000 | \n",
" 17.000000 | \n",
" 99.000000 | \n",
"
\n",
" \n",
" | HouseYear | \n",
" 5000.0 | \n",
" 1984.392600 | \n",
" 18.573149 | \n",
" 1908.000000 | \n",
" 1973.000000 | \n",
" 1977.000000 | \n",
" 2000.000000 | \n",
" 2020.000000 | \n",
"
\n",
" \n",
" | Ecology_1 | \n",
" 5000.0 | \n",
" 0.119874 | \n",
" 0.120070 | \n",
" 0.000000 | \n",
" 0.019509 | \n",
" 0.072158 | \n",
" 0.195781 | \n",
" 0.521867 | \n",
"
\n",
" \n",
" | Social_1 | \n",
" 5000.0 | \n",
" 24.933800 | \n",
" 17.532202 | \n",
" 0.000000 | \n",
" 6.000000 | \n",
" 25.000000 | \n",
" 36.000000 | \n",
" 74.000000 | \n",
"
\n",
" \n",
" | Social_2 | \n",
" 5000.0 | \n",
" 5406.900000 | \n",
" 4026.614773 | \n",
" 168.000000 | \n",
" 1564.000000 | \n",
" 5285.000000 | \n",
" 7287.000000 | \n",
" 19083.000000 | \n",
"
\n",
" \n",
" | Social_3 | \n",
" 5000.0 | \n",
" 8.262600 | \n",
" 23.863762 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 2.000000 | \n",
" 5.000000 | \n",
" 141.000000 | \n",
"
\n",
" \n",
" | Healthcare_1 | \n",
" 2623.0 | \n",
" 1146.657227 | \n",
" 1044.744995 | \n",
" 0.000000 | \n",
" 325.000000 | \n",
" 900.000000 | \n",
" 1548.000000 | \n",
" 4849.000000 | \n",
"
\n",
" \n",
" | Helthcare_2 | \n",
" 5000.0 | \n",
" 1.319400 | \n",
" 1.479940 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
" 2.000000 | \n",
" 6.000000 | \n",
"
\n",
" \n",
" | Shops_1 | \n",
" 5000.0 | \n",
" 4.242800 | \n",
" 4.777365 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
" 3.000000 | \n",
" 6.000000 | \n",
" 23.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" count mean std min 25% \\\n",
"Id 5000.0 8412.595400 4832.674037 1.000000 4221.750000 \n",
"DistrictId 5000.0 51.279200 44.179466 0.000000 21.000000 \n",
"Rooms 5000.0 1.910000 0.838594 0.000000 1.000000 \n",
"Square 5000.0 56.449501 19.092793 1.378543 41.906230 \n",
"LifeSquare 5000.0 28.630507 21.613903 0.000000 17.873732 \n",
"KitchenSquare 5000.0 5.976800 9.950147 0.000000 1.000000 \n",
"Floor 5000.0 8.632000 5.483228 1.000000 4.000000 \n",
"HouseFloor 5000.0 12.601000 6.789213 0.000000 9.000000 \n",
"HouseYear 5000.0 1984.392600 18.573149 1908.000000 1973.000000 \n",
"Ecology_1 5000.0 0.119874 0.120070 0.000000 0.019509 \n",
"Social_1 5000.0 24.933800 17.532202 0.000000 6.000000 \n",
"Social_2 5000.0 5406.900000 4026.614773 168.000000 1564.000000 \n",
"Social_3 5000.0 8.262600 23.863762 0.000000 0.000000 \n",
"Healthcare_1 2623.0 1146.657227 1044.744995 0.000000 325.000000 \n",
"Helthcare_2 5000.0 1.319400 1.479940 0.000000 0.000000 \n",
"Shops_1 5000.0 4.242800 4.777365 0.000000 1.000000 \n",
"\n",
" 50% 75% max \n",
"Id 8320.500000 12598.250000 16795.000000 \n",
"DistrictId 37.000000 77.000000 212.000000 \n",
"Rooms 2.000000 2.000000 17.000000 \n",
"Square 52.921339 66.285130 223.453690 \n",
"LifeSquare 29.118838 41.769526 303.071106 \n",
"KitchenSquare 6.000000 9.000000 620.000000 \n",
"Floor 7.000000 12.000000 78.000000 \n",
"HouseFloor 12.000000 17.000000 99.000000 \n",
"HouseYear 1977.000000 2000.000000 2020.000000 \n",
"Ecology_1 0.072158 0.195781 0.521867 \n",
"Social_1 25.000000 36.000000 74.000000 \n",
"Social_2 5285.000000 7287.000000 19083.000000 \n",
"Social_3 2.000000 5.000000 141.000000 \n",
"Healthcare_1 900.000000 1548.000000 4849.000000 \n",
"Helthcare_2 1.000000 2.000000 6.000000 \n",
"Shops_1 3.000000 6.000000 23.000000 "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics of the test set, one row per feature.\n",
"test.describe().transpose()"
]
},
{
"cell_type": "markdown",
"id": "980ca1c5",
"metadata": {},
"source": [
"В тестовых данных максимальное значение признака 'Square' равно примерно 223,5. Необходимо понять, сколько объектов с большей площадью в тренировочных данных.\n",
"\n",
"В тестовых данных максимальное значение признака 'LifeSquare' равно примерно 303,1 (ниже в качестве порога используется 303,5). Необходимо понять, сколько объектов с большей площадью в тренировочных данных.\n",
"\n",
"В тестовых данных максимальное значение признака 'KitchenSquare' равно 620. Скорее всего, это ошибочное значение, но оно есть в\n",
"тестовых данных. Необходимо понять, сколько объектов с большей площадью в тренировочных данных."
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "295cd4ef",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Всего 10 Элементов, удалим их из выборки\n"
]
}
],
"source": [
"# Training values that exceed the area limits taken from the test set.\n",
"sq = train.loc[train['Square'] > 223.5, 'Square']\n",
"lsq = train.loc[train['LifeSquare'] > 303.5, 'LifeSquare']\n",
"ksq = train.loc[train['KitchenSquare'] > 620, 'KitchenSquare']\n",
"\n",
"all_data = pd.concat((sq, lsq, ksq), sort=False).reset_index(drop=True)\n",
"\n",
"# Count distinct rows by index label, not distinct values: a row that breaks\n",
"# two limits at once contributes two values but is removed only once.\n",
"outlier_rows = sq.index.union(lsq.index).union(ksq.index)\n",
"\n",
"print(f'Всего {len(outlier_rows)} Элементов, удалим их из выборки')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "a6976c0c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9992"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep the rows whose areas do not exceed the limits. `<=` is the exact\n",
"# complement of the `>` outlier test, so values equal to a limit are kept.\n",
"within_limits = (\n",
"    (train['Square'] <= 223.5)\n",
"    & (train['LifeSquare'] <= 303.5)\n",
"    & (train['KitchenSquare'] <= 620)\n",
")\n",
"# copy() makes the result an independent frame, so later .loc assignments\n",
"# do not operate on a slice of the original data.\n",
"train = train.loc[within_limits].copy()\n",
"train.index.size"
]
},
{
"cell_type": "markdown",
"id": "0edafb5a",
"metadata": {},
"source": [
"### HouseYear"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "ed9833f5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- HouseYear ---\n",
"1497 20052011\n",
"4189 4968\n",
"Name: HouseYear, dtype: uint32\n"
]
}
],
"source": [
"# List training rows whose HouseYear is later than the current calendar year.\n",
"# The cutoff comes from datetime.now(), so it depends on when the notebook runs.\n",
"print('--- HouseYear ---')\n",
"print(train.loc[train['HouseYear'] > datetime.now().year, 'HouseYear'])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "a50a5c5c",
"metadata": {},
"outputs": [],
"source": [
"# Only two outliers - correct them by hand with explicit replacement values.\n",
"train.loc[train['HouseYear'] == 20052011, 'HouseYear'] = 2005\n",
"train.loc[train['HouseYear'] == 4968, 'HouseYear'] = 1968"
]
},
{
"cell_type": "markdown",
"id": "7b976628",
"metadata": {},
"source": [
"### Rooms"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "ebea7f62",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2 3878\n",
"1 3702\n",
"3 2233\n",
"4 150\n",
"5 17\n",
"0 8\n",
"10 2\n",
"19 1\n",
"6 1\n",
"Name: Rooms, dtype: int64"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Frequency of each Rooms value in the training set.\n",
"train['Rooms'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "179af69a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2.0"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Replace the outlier room counts (0, 6, 10, 19) with the column median.\n",
"# The median is evaluated before the assignment, i.e. with the outliers still present.\n",
"train.loc[train['Rooms'].isin([0, 6, 10, 19]), 'Rooms'] = train['Rooms'].median()\n",
"train['Rooms'].median()"
]
},
{
"cell_type": "markdown",
"id": "b904a246",
"metadata": {},
"source": [
"### Floor, HouseFloor"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "5e22fa8d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1822\n"
]
}
],
"source": [
"# Count training rows where the flat's floor exceeds the number of floors in\n",
"# the house. Odd data, but it is left unchanged for now.\n",
"print(int((train['Floor'] > train['HouseFloor']).sum()))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "305c41fe",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"884\n"
]
}
],
"source": [
"# The test set contains such rows as well.\n",
"print(int((test['Floor'] > test['HouseFloor']).sum()))"
]
},
{
"cell_type": "markdown",
"id": "ea0eaca5",
"metadata": {},
"source": [
"### Healthcare_1"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "150f2b54",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Compare the DistrictId distribution of rows with and without Healthcare_1.\n",
"HC_Null = train.loc[train['Healthcare_1'].isnull(), ['DistrictId']].astype('int64')\n",
"HC_Fill = train.loc[train['Healthcare_1'].notnull(), ['DistrictId']].astype('int64')\n",
"\n",
"plt.hist(HC_Null['DistrictId'].to_numpy(), bins=30, density=True, alpha=0.5, label='NULL', color='grey')\n",
"plt.hist(HC_Fill['DistrictId'].to_numpy(), bins=30, density=True, alpha=0.5, label='Not NULL', color='green')\n",
"\n",
"# The label= arguments above are only drawn once legend() is called.\n",
"plt.xlabel('DistrictId')\n",
"plt.ylabel('Density')\n",
"plt.title('DistrictId distribution by Healthcare_1 availability')\n",
"plt.legend()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "8cc1f821",
"metadata": {},
"source": [
"Видно, что основная часть незаполненных (серых) значений находятся в кварталах с начальными номерами от 0 до 100\n",
"то и NULL вполне может быть правильным значением, заменим его на \"0\""
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "b2503ed8",
"metadata": {},
"outputs": [],
"source": [
"train.loc[train['Healthcare_1'].isnull(), 'Healthcare_1'] = 0\n",
"test.loc[test['Healthcare_1'].isnull(), 'Healthcare_1'] = 0"
]
},
{
"cell_type": "markdown",
"id": "0aa308b6",
"metadata": {},
"source": [
"что мы имеем по незаполненыь данным"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "1fc628e9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Id 0\n",
"DistrictId 0\n",
"Rooms 0\n",
"Square 0\n",
"LifeSquare 0\n",
"KitchenSquare 0\n",
"Floor 0\n",
"HouseFloor 0\n",
"HouseYear 0\n",
"Ecology_1 0\n",
"Ecology_2 0\n",
"Ecology_3 0\n",
"Social_1 0\n",
"Social_2 0\n",
"Social_3 0\n",
"Healthcare_1 0\n",
"Helthcare_2 0\n",
"Shops_1 0\n",
"Shops_2 0\n",
"Price 0\n",
"dtype: int64"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "d63832bc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Id 0\n",
"DistrictId 0\n",
"Rooms 0\n",
"Square 0\n",
"LifeSquare 0\n",
"KitchenSquare 0\n",
"Floor 0\n",
"HouseFloor 0\n",
"HouseYear 0\n",
"Ecology_1 0\n",
"Ecology_2 0\n",
"Ecology_3 0\n",
"Social_1 0\n",
"Social_2 0\n",
"Social_3 0\n",
"Healthcare_1 0\n",
"Helthcare_2 0\n",
"Shops_1 0\n",
"Shops_2 0\n",
"dtype: int64"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f8288396",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 30,
"id": "919eae2c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"corrmat = train.corr()\n",
"plt.subplots(figsize=(12, 10))\n",
"sns.heatmap(corrmat, cmap='coolwarm', annot = True, linewidths=0.3, square=True)"
]
},
{
"cell_type": "markdown",
"id": "39797bdc",
"metadata": {},
"source": [
"###### Коэффициенты коррекляции в с показателем 'Price' больше всего у параметров 'Rooms' и 'Square'. Но, если выводить эти таблицы для каждого из кварталов, то картина будет немного иной и будет не совсем верно игнорировать тот факт, что цена хорошо корреклирует с общей площадью в рамках каждого квартала.\n",
"Поэтому добовляем столбец 'PriceForMetr'. Рассчитываться это значение будет для каждого кваратала \"DistrictID\". И будет добавлено в тестовый набор."
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "665ffa87",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gp1 = train.groupby(['DistrictId'])['Square'].sum()\n",
"gp2 = train.groupby(['DistrictId'])['Price'].sum()\n",
"gp3 = gp2 / gp1\n",
"\n",
"train['PriceForMetr'] = train['DistrictId'].map(gp3.to_dict())\n",
"test['PriceForMetr'] = test['DistrictId'].map(gp3.to_dict())\n",
"\n",
"#test.info(memory_usage=\"deep\")\n",
"test['PriceForMetr'].isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "4e0a8dcc",
"metadata": {},
"outputs": [],
"source": [
"# есть незаполненые значения - заполним их медианой\n",
"test.loc[test['PriceForMetr'].isnull(), 'PriceForMetr'] = test['PriceForMetr'].median()"
]
},
{
"cell_type": "markdown",
"id": "140432e8",
"metadata": {},
"source": [
"преобразуеь 'Ecology_2', 'Ecology_3', 'Shops_2' в бинарные признаки."
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "4e9a176b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ecology_2 in train \n",
"\n",
"B 9895\n",
"A 97\n",
"Name: Ecology_2, dtype: int64\n",
"****************************************************************************************************\n",
"\n",
"Ecology_2 in test \n",
"\n",
"B 4952\n",
"A 48\n",
"Name: Ecology_2, dtype: int64\n",
"****************************************************************************************************\n",
"\n",
"Ecology_3 in train \n",
"\n",
"B 9718\n",
"A 274\n",
"Name: Ecology_3, dtype: int64\n",
"****************************************************************************************************\n",
"\n",
"Ecology_3 in test \n",
"\n",
"B 4851\n",
"A 149\n",
"Name: Ecology_3, dtype: int64\n",
"****************************************************************************************************\n",
"\n",
"Shops_2 in train \n",
"\n",
"B 9168\n",
"A 824\n",
"Name: Shops_2, dtype: int64\n",
"****************************************************************************************************\n",
"\n",
"Shops_2 in test \n",
"\n",
"B 4588\n",
"A 412\n",
"Name: Shops_2, dtype: int64\n",
"****************************************************************************************************\n",
"\n"
]
}
],
"source": [
"for cat_colname in ['Ecology_2', 'Ecology_3', 'Shops_2']:\n",
" print(str(cat_colname) + ' in train \\n\\n' + str(train[cat_colname].value_counts()) + '\\n' + '*' * 100 + '\\n')\n",
" print(str(cat_colname) + ' in test \\n\\n' + str(test[cat_colname].value_counts()) + '\\n' + '*' * 100 + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "d1f72ce1",
"metadata": {},
"outputs": [],
"source": [
"train['Ecology_2_bin'] = train['Ecology_2'].replace({'A':0, 'B':1})\n",
"train['Ecology_3_bin'] = train['Ecology_3'].replace({'A':0, 'B':1})\n",
"train['Shops_2_bin'] = train['Shops_2'].replace({'A':0, 'B':1})\n",
"\n",
"test['Ecology_2_bin'] = test['Ecology_2'].replace({'A':0, 'B':1})\n",
"test['Ecology_3_bin'] = test['Ecology_3'].replace({'A':0, 'B':1})\n",
"test['Shops_2_bin'] = test['Shops_2'].replace({'A':0, 'B':1})"
]
},
{
"cell_type": "markdown",
"id": "8588b213",
"metadata": {},
"source": [
"## МОДЕЛЬ"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "c007dbdb",
"metadata": {},
"outputs": [],
"source": [
"models_dict = {}"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "463a71bb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1', 'PriceForMetr']\n"
]
}
],
"source": [
"features = train.select_dtypes(exclude=['object', 'string', 'category']).columns.tolist()\n",
"features.remove('Price')\n",
"print(features)\n",
"\n",
"target = 'Price'"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "8fb0b73d",
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(\n",
" train[features], train[target], test_size=0.3, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "0033e835",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Int64Index: 6994 entries, 3174 to 7276\n",
"Data columns (total 15 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Rooms 6994 non-null uint8 \n",
" 1 Square 6994 non-null float32\n",
" 2 LifeSquare 6994 non-null float32\n",
" 3 KitchenSquare 6994 non-null float32\n",
" 4 Floor 6994 non-null uint8 \n",
" 5 HouseFloor 6994 non-null uint8 \n",
" 6 HouseYear 6994 non-null uint32 \n",
" 7 Ecology_1 6994 non-null float32\n",
" 8 Social_1 6994 non-null uint8 \n",
" 9 Social_2 6994 non-null uint16 \n",
" 10 Social_3 6994 non-null uint8 \n",
" 11 Healthcare_1 6994 non-null float32\n",
" 12 Helthcare_2 6994 non-null uint8 \n",
" 13 Shops_1 6994 non-null uint8 \n",
" 14 PriceForMetr 6994 non-null float64\n",
"dtypes: float32(5), float64(1), uint16(1), uint32(1), uint8(7)\n",
"memory usage: 334.7 KB\n"
]
}
],
"source": [
"X_train.info()"
]
},
{
"cell_type": "markdown",
"id": "79f4145d",
"metadata": {},
"source": [
"## Linear Regression"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "bd6efb33",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression()"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"line_regression_model = LinearRegression()\n",
"line_regression_model.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "19c6a858",
"metadata": {},
"outputs": [],
"source": [
"models_dict['Linear Regression'] = line_regression_model"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "550ec302",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"model_test(line_regression_model, 'Linear Regression', X_test, y_test)"
]
},
{
"cell_type": "markdown",
"id": "1968f6c6",
"metadata": {},
"source": [
"## Random Forest Regressor"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "24499b5c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestRegressor()"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"random_forest_regressor_model = RandomForestRegressor()\n",
"random_forest_regressor_model.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "f12c76db",
"metadata": {},
"outputs": [],
"source": [
"models_dict['Random Forest Regressor'] = random_forest_regressor_model"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "4202e128",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"model_test(random_forest_regressor_model,\n",
" 'Random Forest Regressor', X_test, y_test)"
]
},
{
"cell_type": "markdown",
"id": "0497dd8a",
"metadata": {},
"source": [
"## Gradient Boosting Regressor"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "7ee99f3f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GradientBoostingRegressor()"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gradient_boosting_regressor_model = GradientBoostingRegressor()\n",
"gradient_boosting_regressor_model.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "b8f7f41c",
"metadata": {},
"outputs": [],
"source": [
"models_dict['Gradient Boosting Regressor'] = gradient_boosting_regressor_model"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "2615039b",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"model_test(gradient_boosting_regressor_model,\n",
" 'Gradient Boosting Regressor', X_test, y_test)"
]
},
{
"cell_type": "markdown",
"id": "38106a47",
"metadata": {},
"source": [
"## LassoCV"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "01cbb842",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LassoCV()"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lasso_cv_model = LassoCV()\n",
"lasso_cv_model.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "3ace9270",
"metadata": {},
"outputs": [],
"source": [
"models_dict['LassoCV'] = lasso_cv_model"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "4b6b72e3",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"model_test(lasso_cv_model, 'LassoCV', X_test, y_test)"
]
},
{
"cell_type": "markdown",
"id": "d5a914f8",
"metadata": {},
"source": [
"## LGBMRegressor"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "1d5b6f17",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LGBMRegressor()"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lgbm_regressor_model = LGBMRegressor()\n",
"lgbm_regressor_model.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "422c9904",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"model_test(lgbm_regressor_model, 'LGBMRegressor', X_test, y_test)"
]
},
{
"cell_type": "markdown",
"id": "99c110c9",
"metadata": {},
"source": [
"#### Tunning LGBMRegressor"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "daada10a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lgbm_regressor_model.get_params"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "e7bccecb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.01, 0.02, 0.03, 0.04])"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.arange(0.01, 0.05, 0.01)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "66062abb",
"metadata": {},
"outputs": [],
"source": [
"parameters = [{\n",
" 'max_bin': np.arange(90, 120, 10),\n",
" 'n_estimators': np.arange(4000, 7000, 1000),\n",
" 'learning_rate': np.arange(0.01, 0.05, 0.01)\n",
"}]\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "53ab888a",
"metadata": {},
"outputs": [],
"source": [
"clf = GridSearchCV(\n",
" estimator=LGBMRegressor(random_state=42),\n",
" param_grid=parameters,\n",
" scoring='neg_mean_squared_error',\n",
" cv=4,\n",
" n_jobs=-1,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "40ad3650",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GridSearchCV(cv=4, estimator=LGBMRegressor(random_state=42), n_jobs=-1,\n",
" param_grid=[{'learning_rate': array([0.01, 0.02, 0.03, 0.04]),\n",
" 'max_bin': array([ 90, 100, 110]),\n",
" 'n_estimators': array([4000, 5000, 6000])}],\n",
" scoring='neg_mean_squared_error')"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "9fb4f9d1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" param_learning_rate | \n",
" param_max_bin | \n",
" param_n_estimators | \n",
" mean_test_score | \n",
"
\n",
" \n",
" \n",
" \n",
" | 6 | \n",
" 0.01 | \n",
" 110 | \n",
" 4000 | \n",
" -2.088079e+09 | \n",
"
\n",
" \n",
" | 7 | \n",
" 0.01 | \n",
" 110 | \n",
" 5000 | \n",
" -2.109738e+09 | \n",
"
\n",
" \n",
" | 0 | \n",
" 0.01 | \n",
" 90 | \n",
" 4000 | \n",
" -2.110668e+09 | \n",
"
\n",
" \n",
" | 3 | \n",
" 0.01 | \n",
" 100 | \n",
" 4000 | \n",
" -2.116463e+09 | \n",
"
\n",
" \n",
" | 8 | \n",
" 0.01 | \n",
" 110 | \n",
" 6000 | \n",
" -2.128995e+09 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0.01 | \n",
" 90 | \n",
" 5000 | \n",
" -2.131810e+09 | \n",
"
\n",
" \n",
" | 4 | \n",
" 0.01 | \n",
" 100 | \n",
" 5000 | \n",
" -2.135978e+09 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0.01 | \n",
" 90 | \n",
" 6000 | \n",
" -2.150383e+09 | \n",
"
\n",
" \n",
" | 5 | \n",
" 0.01 | \n",
" 100 | \n",
" 6000 | \n",
" -2.153554e+09 | \n",
"
\n",
" \n",
" | 15 | \n",
" 0.02 | \n",
" 110 | \n",
" 4000 | \n",
" -2.163152e+09 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" param_learning_rate param_max_bin param_n_estimators mean_test_score\n",
"6 0.01 110 4000 -2.088079e+09\n",
"7 0.01 110 5000 -2.109738e+09\n",
"0 0.01 90 4000 -2.110668e+09\n",
"3 0.01 100 4000 -2.116463e+09\n",
"8 0.01 110 6000 -2.128995e+09\n",
"1 0.01 90 5000 -2.131810e+09\n",
"4 0.01 100 5000 -2.135978e+09\n",
"2 0.01 90 6000 -2.150383e+09\n",
"5 0.01 100 6000 -2.153554e+09\n",
"15 0.02 110 4000 -2.163152e+09"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cv_results = pd.DataFrame(clf.cv_results_)\n",
"param_columns = [\n",
" column\n",
" for column in cv_results.columns\n",
" if column.startswith('param_')\n",
"]\n",
"\n",
"score_columns = ['mean_test_score']\n",
"\n",
"cv_results = (cv_results[param_columns + score_columns]\n",
" .sort_values(by=score_columns, ascending=False))\n",
"\n",
"cv_results.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "9ec4c48e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'learning_rate': 0.01, 'max_bin': 110, 'n_estimators': 4000}"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf.best_params_"
]
},
{
"cell_type": "markdown",
"id": "e4dde9fb",
"metadata": {},
"source": [
"#### Test tunning LGBMRegressor"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "81c1ad27",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LGBMRegressor(learning_rate=0.01, max_bin=110, n_estimators=4000, num_leaves=4)"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lgbm_regressor_model = LGBMRegressor(\n",
" max_bin=110,\n",
" num_leaves=4,\n",
" n_estimators=4000,\n",
" learning_rate=0.01\n",
")\n",
"lgbm_regressor_model.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "284d4b6d",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"model_test(lgbm_regressor_model, 'LGBMRegressor', X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "4dd74a5a",
"metadata": {},
"outputs": [],
"source": [
"models_dict['LGBMRegressor'] = lgbm_regressor_model"
]
},
{
"cell_type": "markdown",
"id": "6e1b3c14",
"metadata": {},
"source": [
"### XGBRegressor"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "20ba592a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,\n",
" colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,\n",
" early_stopping_rounds=None, enable_categorical=False,\n",
" eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,\n",
" grow_policy='depthwise', importance_type=None,\n",
" interaction_constraints='', learning_rate=0.300000012, max_bin=256,\n",
" max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,\n",
" max_depth=6, max_leaves=0, min_child_weight=1, missing=nan,\n",
" monotone_constraints='()', n_estimators=100, n_jobs=0,\n",
" num_parallel_tree=1, predictor='auto', random_state=0, ...)"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"xgboost_model = XGBRegressor()\n",
"xgboost_model.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "e659f3d7",
"metadata": {},
"outputs": [],
"source": [
"models_dict['XGBRegressor'] = xgboost_model"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "40264a30",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"model_test(xgboost_model, 'XGBRegressor', X_test, y_test)"
]
},
{
"cell_type": "markdown",
"id": "c5c6e349",
"metadata": {},
"source": [
"# Резульитаты"
]
},
{
"cell_type": "markdown",
"id": "7c10701a",
"metadata": {},
"source": [
"найдем лучшее"
]
},
{
"cell_type": "code",
"execution_count": 130,
"id": "10e6da3c",
"metadata": {},
"outputs": [],
"source": [
"models_score_test = models_r2(models_dict, X_test, y_test)\n",
"models_score_train = models_r2(models_dict, X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 131,
"id": "6ccb6d3f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" r2 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 2 | \n",
" Gradient Boosting Regressor | \n",
" 0.752612 | \n",
"
\n",
" \n",
" | 4 | \n",
" LGBMRegressor | \n",
" 0.752397 | \n",
"
\n",
" \n",
" | 1 | \n",
" Random Forest Regressor | \n",
" 0.751699 | \n",
"
\n",
" \n",
" | 5 | \n",
" XGBRegressor | \n",
" 0.743216 | \n",
"
\n",
" \n",
" | 0 | \n",
" Linear Regression | \n",
" 0.667146 | \n",
"
\n",
" \n",
" | 3 | \n",
" LassoCV | \n",
" 0.652968 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name r2\n",
"2 Gradient Boosting Regressor 0.752612\n",
"4 LGBMRegressor 0.752397\n",
"1 Random Forest Regressor 0.751699\n",
"5 XGBRegressor 0.743216\n",
"0 Linear Regression 0.667146\n",
"3 LassoCV 0.652968"
]
},
"execution_count": 131,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"models_score_test[['name', 'r2']]"
]
},
{
"cell_type": "code",
"execution_count": 132,
"id": "d55a394e",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"r2_max_test = models_score_test['r2'].max()\n",
"r2_max_train = models_score_train['r2'].max()\n",
"plt.barh(models_score_test['name'], models_score_test['r2'],\n",
" alpha=0.5, color='red', label=f'Test Data: R2 max: {r2_max_test:.4f}')\n",
"plt.barh(models_score_train['name'], models_score_train['r2'],\n",
" alpha=0.5, color='grey', label=f'Train Data: R2 max: {r2_max_train:.4f}')\n",
"plt.title('R2')\n",
"plt.legend()\n",
"plt.axvline(0.6, color='red')\n",
"plt.axvline(r2_max_test, color='yellow')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 133,
"id": "6ea21b25",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"mse_min_test = models_score_test['mse'].min()\n",
"mse_min_train = models_score_train['mse'].min()\n",
"plt.barh(models_score_test['name'], models_score_test['mse'],\n",
" alpha=0.5, color='red', label=f'Test Data MSE min: {mse_min_test:.0e}')\n",
"plt.barh(models_score_train['name'], models_score_train['mse'],\n",
" alpha=0.5, color='grey', label=f'Train Data MSE min: {mse_min_train:.0e}')\n",
"plt.title('Mean squared error')\n",
"plt.legend(loc=2)\n",
"plt.axvline(mse_min_test, color='yellow')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "39b6cf09",
"metadata": {},
"source": [
"### Выбор\n",
"Gradient Boosting Regressor - не применяем по условию задания!\n",
"выберем следуюющий это \"LGBMRegressor\""
]
},
{
"cell_type": "code",
"execution_count": 134,
"id": "dbd2dc85",
"metadata": {},
"outputs": [],
"source": [
"best_model = models_dict['LGBMRegressor']"
]
},
{
"cell_type": "code",
"execution_count": 135,
"id": "58539fae",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" importances | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Rooms | \n",
" 322 | \n",
"
\n",
" \n",
" | 1 | \n",
" Square | \n",
" 1492 | \n",
"
\n",
" \n",
" | 2 | \n",
" LifeSquare | \n",
" 927 | \n",
"
\n",
" \n",
" | 3 | \n",
" KitchenSquare | \n",
" 815 | \n",
"
\n",
" \n",
" | 4 | \n",
" Floor | \n",
" 573 | \n",
"
\n",
" \n",
" | 5 | \n",
" HouseFloor | \n",
" 920 | \n",
"
\n",
" \n",
" | 6 | \n",
" HouseYear | \n",
" 1502 | \n",
"
\n",
" \n",
" | 7 | \n",
" Ecology_1 | \n",
" 1254 | \n",
"
\n",
" \n",
" | 8 | \n",
" Social_1 | \n",
" 614 | \n",
"
\n",
" \n",
" | 9 | \n",
" Social_2 | \n",
" 199 | \n",
"
\n",
" \n",
" | 10 | \n",
" Social_3 | \n",
" 364 | \n",
"
\n",
" \n",
" | 11 | \n",
" Healthcare_1 | \n",
" 740 | \n",
"
\n",
" \n",
" | 12 | \n",
" Helthcare_2 | \n",
" 143 | \n",
"
\n",
" \n",
" | 13 | \n",
" Shops_1 | \n",
" 273 | \n",
"
\n",
" \n",
" | 14 | \n",
" PriceForMetr | \n",
" 1862 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name importances\n",
"0 Rooms 322\n",
"1 Square 1492\n",
"2 LifeSquare 927\n",
"3 KitchenSquare 815\n",
"4 Floor 573\n",
"5 HouseFloor 920\n",
"6 HouseYear 1502\n",
"7 Ecology_1 1254\n",
"8 Social_1 614\n",
"9 Social_2 199\n",
"10 Social_3 364\n",
"11 Healthcare_1 740\n",
"12 Helthcare_2 143\n",
"13 Shops_1 273\n",
"14 PriceForMetr 1862"
]
},
"execution_count": 135,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame({'name': list(X_train.columns),\n",
" 'importances': list(best_model.feature_importances_)})"
]
},
{
"cell_type": "code",
"execution_count": 136,
"id": "709fe251",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"model_test(best_model, 'best_model', X_test, y_test)"
]
},
{
"cell_type": "markdown",
"id": "7f8f48ea",
"metadata": {},
"source": [
"### Вывод в файл"
]
},
{
"cell_type": "code",
"execution_count": 95,
"id": "960996db",
"metadata": {},
"outputs": [],
"source": [
"test_features = list(X_train.columns)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "769b58d1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 5000 entries, 0 to 4999\n",
"Data columns (total 15 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Rooms 5000 non-null uint8 \n",
" 1 Square 5000 non-null float32\n",
" 2 LifeSquare 5000 non-null float32\n",
" 3 KitchenSquare 5000 non-null float32\n",
" 4 Floor 5000 non-null uint8 \n",
" 5 HouseFloor 5000 non-null uint8 \n",
" 6 HouseYear 5000 non-null uint16 \n",
" 7 Ecology_1 5000 non-null float32\n",
" 8 Social_1 5000 non-null uint8 \n",
" 9 Social_2 5000 non-null uint16 \n",
" 10 Social_3 5000 non-null uint8 \n",
" 11 Healthcare_1 5000 non-null float32\n",
" 12 Helthcare_2 5000 non-null uint8 \n",
" 13 Shops_1 5000 non-null uint8 \n",
" 14 PriceForMetr 5000 non-null float64\n",
"dtypes: float32(5), float64(1), uint16(2), uint8(7)\n",
"memory usage: 190.6 KB\n"
]
}
],
"source": [
"test[test_features].info()"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "4a2e7bf3",
"metadata": {},
"outputs": [],
"source": [
"test['Price'] = best_model.predict(test[test_features])"
]
},
{
"cell_type": "code",
"execution_count": 119,
"id": "16dfa624",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"mu = 215623.41 and sigma = 80260.29\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"price_log = np.log1p(test['Price'])\n",
"#sns.distplot(price_log, fit=norm)\n",
"sns.histplot(price_log, kde=True)\n",
"#sns.displot(price_log, kde=True)\n",
"\n",
"mu, sigma = norm.fit(test['Price'])\n",
"\n",
"print(f'mu = {mu:.2f} and sigma = {sigma:.2f}')\n",
"\n",
"plt.legend(\n",
" [f'Normal dist. ($\\mu=$ {mu:.2f} and $\\sigma=$ {sigma:.2f} )'], loc='best')\n",
"plt.ylabel('Frequency')\n",
"plt.title('Price distribution')\n",
"\n",
"# QQ-plot\n",
"fig = plt.figure()\n",
"res = stats.probplot(price_log, plot=plt)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 137,
"id": "c1b045a3",
"metadata": {},
"outputs": [],
"source": [
"test[['Id', 'Price']].to_csv('DGudilin_predictions.csv', index=None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bdc710c9",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}