ckexp/lec4.ipynb

3414 lines
158 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "Y5dMmHXIRYEg"
},
"source": [
"#### Загрузка и распаковка данных"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"from urllib.request import urlretrieve\n",
"from zipfile import ZipFile\n",
"\n",
"# Remote location of the zipped dataset and the local path it is saved to\n",
"ds_url = \"https://github.com/PacktPublishing/Interpretable-Machine-Learning-with-Python/raw/master/datasets/aa-domestic-delays-2018.csv.zip\"\n",
"ds_zip_filename = \"data/aa-domestic-delays-2018.csv.zip\"\n",
"\n",
"# urlretrieve does not create missing parent folders, so make sure the\n",
"# download directory exists; otherwise a fresh checkout fails with\n",
"# FileNotFoundError before anything is downloaded.\n",
"Path(ds_zip_filename).parent.mkdir(parents=True, exist_ok=True)\n",
"urlretrieve(ds_url, ds_zip_filename)\n",
"\n",
"# Unpack the archive into the data folder; the context manager closes the zip file\n",
"with ZipFile(ds_zip_filename, \"r\") as zObject:\n",
"    zObject.extractall(path=\"data\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Загрузка данных в Dataframe"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YvuKMosoRY7K",
"outputId": "f9f05784-9c91-4869-a02e-e1ccc8115a8d"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 899527 entries, 0 to 899526\n",
"Data columns (total 23 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 FL_NUM 899527 non-null int64 \n",
" 1 ORIGIN 899527 non-null object \n",
" 2 DEST 899527 non-null object \n",
" 3 PLANNED_DEP_DATETIME 899527 non-null object \n",
" 4 CRS_DEP_TIME 899527 non-null int64 \n",
" 5 DEP_TIME 899527 non-null float64\n",
" 6 DEP_DELAY 899527 non-null float64\n",
" 7 DEP_AFPH 899527 non-null float64\n",
" 8 DEP_RFPH 899527 non-null float64\n",
" 9 TAXI_OUT 899527 non-null float64\n",
" 10 WHEELS_OFF 899527 non-null float64\n",
" 11 CRS_ELAPSED_TIME 899527 non-null float64\n",
" 12 PCT_ELAPSED_TIME 899527 non-null float64\n",
" 13 DISTANCE 899527 non-null float64\n",
" 14 CRS_ARR_TIME 899527 non-null int64 \n",
" 15 ARR_AFPH 899527 non-null float64\n",
" 16 ARR_RFPH 899527 non-null float64\n",
" 17 ARR_DELAY 899527 non-null float64\n",
" 18 CARRIER_DELAY 899527 non-null float64\n",
" 19 WEATHER_DELAY 899527 non-null float64\n",
" 20 NAS_DELAY 899527 non-null float64\n",
" 21 SECURITY_DELAY 899527 non-null float64\n",
" 22 LATE_AIRCRAFT_DELAY 899527 non-null float64\n",
"dtypes: float64(17), int64(3), object(3)\n",
"memory usage: 157.8+ MB\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Read the extracted CSV into the untouched source frame and print its schema summary\n",
"csv_path = \"data/aa-domestic-delays-2018.csv\"\n",
"orig_df = pd.read_csv(csv_path)\n",
"orig_df.info()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Hen4OWkxSEsb"
},
"source": [
"#### Подготовка данных и конструирование признаков"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"id": "qnyD6ZeLSGwL"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>CRS_DEP_TIME</th>\n",
" <th>DEP_TIME</th>\n",
" <th>DEP_DELAY</th>\n",
" <th>DEP_AFPH</th>\n",
" <th>DEP_RFPH</th>\n",
" <th>TAXI_OUT</th>\n",
" <th>WHEELS_OFF</th>\n",
" <th>CRS_ELAPSED_TIME</th>\n",
" <th>PCT_ELAPSED_TIME</th>\n",
" <th>DISTANCE</th>\n",
" <th>...</th>\n",
" <th>ARR_RFPH</th>\n",
" <th>CARRIER_DELAY</th>\n",
" <th>WEATHER_DELAY</th>\n",
" <th>NAS_DELAY</th>\n",
" <th>SECURITY_DELAY</th>\n",
" <th>LATE_AIRCRAFT_DELAY</th>\n",
" <th>DEP_MONTH</th>\n",
" <th>DEP_DOW</th>\n",
" <th>ORIGIN_HUB</th>\n",
" <th>DEST_HUB</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1155</td>\n",
" <td>1149.0</td>\n",
" <td>-6.0</td>\n",
" <td>34.444444</td>\n",
" <td>0.956790</td>\n",
" <td>14.0</td>\n",
" <td>1203.0</td>\n",
" <td>219.0</td>\n",
" <td>0.963470</td>\n",
" <td>1192.0</td>\n",
" <td>...</td>\n",
" <td>0.854573</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>705</td>\n",
" <td>700.0</td>\n",
" <td>-5.0</td>\n",
" <td>17.454545</td>\n",
" <td>0.242424</td>\n",
" <td>16.0</td>\n",
" <td>716.0</td>\n",
" <td>171.0</td>\n",
" <td>0.918129</td>\n",
" <td>1192.0</td>\n",
" <td>...</td>\n",
" <td>0.731707</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1148</td>\n",
" <td>1145.0</td>\n",
" <td>-3.0</td>\n",
" <td>94.736842</td>\n",
" <td>0.947368</td>\n",
" <td>14.0</td>\n",
" <td>1159.0</td>\n",
" <td>212.0</td>\n",
" <td>0.971698</td>\n",
" <td>1558.0</td>\n",
" <td>...</td>\n",
" <td>1.092437</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>825</td>\n",
" <td>824.0</td>\n",
" <td>-1.0</td>\n",
" <td>33.559322</td>\n",
" <td>0.860495</td>\n",
" <td>16.0</td>\n",
" <td>840.0</td>\n",
" <td>271.0</td>\n",
" <td>0.918819</td>\n",
" <td>1558.0</td>\n",
" <td>...</td>\n",
" <td>0.867379</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1155</td>\n",
" <td>1147.0</td>\n",
" <td>-8.0</td>\n",
" <td>33.461538</td>\n",
" <td>0.929487</td>\n",
" <td>13.0</td>\n",
" <td>1200.0</td>\n",
" <td>99.0</td>\n",
" <td>0.969697</td>\n",
" <td>331.0</td>\n",
" <td>...</td>\n",
" <td>1.006803</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>899522</th>\n",
" <td>1534</td>\n",
" <td>1530.0</td>\n",
" <td>-4.0</td>\n",
" <td>35.357143</td>\n",
" <td>0.822259</td>\n",
" <td>20.0</td>\n",
" <td>1550.0</td>\n",
" <td>100.0</td>\n",
" <td>0.990000</td>\n",
" <td>331.0</td>\n",
" <td>...</td>\n",
" <td>0.837945</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>12</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>899523</th>\n",
" <td>1751</td>\n",
" <td>1757.0</td>\n",
" <td>6.0</td>\n",
" <td>71.818182</td>\n",
" <td>1.040843</td>\n",
" <td>18.0</td>\n",
" <td>1815.0</td>\n",
" <td>181.0</td>\n",
" <td>0.972376</td>\n",
" <td>936.0</td>\n",
" <td>...</td>\n",
" <td>0.697674</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>12</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>899524</th>\n",
" <td>2015</td>\n",
" <td>2010.0</td>\n",
" <td>-5.0</td>\n",
" <td>63.272727</td>\n",
" <td>1.193825</td>\n",
" <td>36.0</td>\n",
" <td>2046.0</td>\n",
" <td>112.0</td>\n",
" <td>1.142857</td>\n",
" <td>511.0</td>\n",
" <td>...</td>\n",
" <td>0.482897</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>12</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>899525</th>\n",
" <td>1300</td>\n",
" <td>1323.0</td>\n",
" <td>23.0</td>\n",
" <td>70.843373</td>\n",
" <td>0.770037</td>\n",
" <td>11.0</td>\n",
" <td>1334.0</td>\n",
" <td>50.0</td>\n",
" <td>0.820000</td>\n",
" <td>130.0</td>\n",
" <td>...</td>\n",
" <td>0.888031</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>12</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>899526</th>\n",
" <td>1435</td>\n",
" <td>1443.0</td>\n",
" <td>8.0</td>\n",
" <td>19.411765</td>\n",
" <td>0.924370</td>\n",
" <td>8.0</td>\n",
" <td>1451.0</td>\n",
" <td>71.0</td>\n",
" <td>0.830986</td>\n",
" <td>130.0</td>\n",
" <td>...</td>\n",
" <td>1.011905</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>12</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>899527 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" CRS_DEP_TIME DEP_TIME DEP_DELAY DEP_AFPH DEP_RFPH TAXI_OUT \\\n",
"0 1155 1149.0 -6.0 34.444444 0.956790 14.0 \n",
"1 705 700.0 -5.0 17.454545 0.242424 16.0 \n",
"2 1148 1145.0 -3.0 94.736842 0.947368 14.0 \n",
"3 825 824.0 -1.0 33.559322 0.860495 16.0 \n",
"4 1155 1147.0 -8.0 33.461538 0.929487 13.0 \n",
"... ... ... ... ... ... ... \n",
"899522 1534 1530.0 -4.0 35.357143 0.822259 20.0 \n",
"899523 1751 1757.0 6.0 71.818182 1.040843 18.0 \n",
"899524 2015 2010.0 -5.0 63.272727 1.193825 36.0 \n",
"899525 1300 1323.0 23.0 70.843373 0.770037 11.0 \n",
"899526 1435 1443.0 8.0 19.411765 0.924370 8.0 \n",
"\n",
" WHEELS_OFF CRS_ELAPSED_TIME PCT_ELAPSED_TIME DISTANCE ... \\\n",
"0 1203.0 219.0 0.963470 1192.0 ... \n",
"1 716.0 171.0 0.918129 1192.0 ... \n",
"2 1159.0 212.0 0.971698 1558.0 ... \n",
"3 840.0 271.0 0.918819 1558.0 ... \n",
"4 1200.0 99.0 0.969697 331.0 ... \n",
"... ... ... ... ... ... \n",
"899522 1550.0 100.0 0.990000 331.0 ... \n",
"899523 1815.0 181.0 0.972376 936.0 ... \n",
"899524 2046.0 112.0 1.142857 511.0 ... \n",
"899525 1334.0 50.0 0.820000 130.0 ... \n",
"899526 1451.0 71.0 0.830986 130.0 ... \n",
"\n",
" ARR_RFPH CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY \\\n",
"0 0.854573 0.0 0.0 0.0 0.0 \n",
"1 0.731707 0.0 0.0 0.0 0.0 \n",
"2 1.092437 0.0 0.0 0.0 0.0 \n",
"3 0.867379 0.0 0.0 0.0 0.0 \n",
"4 1.006803 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... ... \n",
"899522 0.837945 0.0 0.0 0.0 0.0 \n",
"899523 0.697674 0.0 0.0 0.0 0.0 \n",
"899524 0.482897 0.0 0.0 0.0 0.0 \n",
"899525 0.888031 0.0 0.0 0.0 0.0 \n",
"899526 1.011905 0.0 0.0 0.0 0.0 \n",
"\n",
" LATE_AIRCRAFT_DELAY DEP_MONTH DEP_DOW ORIGIN_HUB DEST_HUB \n",
"0 0.0 1 0 1 1 \n",
"1 0.0 1 0 1 1 \n",
"2 0.0 1 0 0 1 \n",
"3 0.0 1 0 1 0 \n",
"4 0.0 1 0 1 1 \n",
"... ... ... ... ... ... \n",
"899522 0.0 12 0 1 1 \n",
"899523 0.0 12 0 1 1 \n",
"899524 0.0 12 0 1 0 \n",
"899525 0.0 12 0 1 0 \n",
"899526 0.0 12 0 0 1 \n",
"\n",
"[899527 rows x 22 columns]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Airports treated as hubs\n",
"hubs = [\"CLT\", \"ORD\", \"DFW\", \"LAX\", \"MIA\", \"JFK\", \"LGA\", \"PHL\", \"PHX\", \"DCA\"]\n",
"# Planned departure timestamp parsed from its string representation\n",
"planned_departure = pd.to_datetime(orig_df[\"PLANNED_DEP_DATETIME\"])\n",
"# Boolean masks: is the origin / destination airport one of the hubs?\n",
"is_origin_hub = orig_df[\"ORIGIN\"].isin(hubs)\n",
"is_dest_hub = orig_df[\"DEST\"].isin(hubs)\n",
"\n",
"# Build the modelling frame in one chain; orig_df itself is left untouched.\n",
"df = orig_df.assign(\n",
"    # Month and day of week of departure capture seasonality and weekday effects\n",
"    DEP_MONTH=planned_departure.dt.month,\n",
"    DEP_DOW=planned_departure.dt.dayofweek,\n",
"    # Hub indicators stored as 0/1 integers\n",
"    ORIGIN_HUB=is_origin_hub.astype(\"int64\"),\n",
"    DEST_HUB=is_dest_hub.astype(\"int64\"),\n",
").drop(\n",
"    columns=[\n",
"        # The raw date is no longer needed once month / day of week are extracted\n",
"        \"PLANNED_DEP_DATETIME\",\n",
"        # Columns not used as features\n",
"        \"FL_NUM\",\n",
"        \"ORIGIN\",\n",
"        \"DEST\",\n",
"        # Total arrival delay is dropped because it would dominate the result\n",
"        \"ARR_DELAY\",\n",
"    ]\n",
")\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "y6f3Z4UwUAUI"
},
"source": [
"#### Формирование тестовой и обучающей выборок данных"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"id": "6LudeeYUUEPt"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>CRS_DEP_TIME</th>\n",
" <th>DEP_TIME</th>\n",
" <th>DEP_DELAY</th>\n",
" <th>DEP_AFPH</th>\n",
" <th>DEP_RFPH</th>\n",
" <th>TAXI_OUT</th>\n",
" <th>WHEELS_OFF</th>\n",
" <th>CRS_ELAPSED_TIME</th>\n",
" <th>PCT_ELAPSED_TIME</th>\n",
" <th>DISTANCE</th>\n",
" <th>...</th>\n",
" <th>ARR_AFPH</th>\n",
" <th>ARR_RFPH</th>\n",
" <th>WEATHER_DELAY</th>\n",
" <th>NAS_DELAY</th>\n",
" <th>SECURITY_DELAY</th>\n",
" <th>LATE_AIRCRAFT_DELAY</th>\n",
" <th>DEP_MONTH</th>\n",
" <th>DEP_DOW</th>\n",
" <th>ORIGIN_HUB</th>\n",
" <th>DEST_HUB</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>31121</th>\n",
" <td>845</td>\n",
" <td>842.0</td>\n",
" <td>-3.0</td>\n",
" <td>16.842105</td>\n",
" <td>0.443213</td>\n",
" <td>21.0</td>\n",
" <td>903.0</td>\n",
" <td>106.0</td>\n",
" <td>0.886792</td>\n",
" <td>331.0</td>\n",
" <td>...</td>\n",
" <td>85.333333</td>\n",
" <td>1.145414</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>633500</th>\n",
" <td>1315</td>\n",
" <td>1316.0</td>\n",
" <td>1.0</td>\n",
" <td>4.918033</td>\n",
" <td>0.983607</td>\n",
" <td>9.0</td>\n",
" <td>1325.0</td>\n",
" <td>121.0</td>\n",
" <td>1.107438</td>\n",
" <td>624.0</td>\n",
" <td>...</td>\n",
" <td>111.891892</td>\n",
" <td>1.286114</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>9</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>747737</th>\n",
" <td>1710</td>\n",
" <td>1704.0</td>\n",
" <td>-6.0</td>\n",
" <td>55.555556</td>\n",
" <td>1.028807</td>\n",
" <td>14.0</td>\n",
" <td>1718.0</td>\n",
" <td>67.0</td>\n",
" <td>1.074627</td>\n",
" <td>304.0</td>\n",
" <td>...</td>\n",
" <td>17.288136</td>\n",
" <td>0.751658</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>10</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>298943</th>\n",
" <td>1840</td>\n",
" <td>1920.0</td>\n",
" <td>40.0</td>\n",
" <td>28.200000</td>\n",
" <td>0.587500</td>\n",
" <td>18.0</td>\n",
" <td>1938.0</td>\n",
" <td>161.0</td>\n",
" <td>0.888199</td>\n",
" <td>852.0</td>\n",
" <td>...</td>\n",
" <td>38.780488</td>\n",
" <td>1.157627</td>\n",
" <td>22.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>843932</th>\n",
" <td>1830</td>\n",
" <td>1822.0</td>\n",
" <td>-8.0</td>\n",
" <td>28.846154</td>\n",
" <td>0.901442</td>\n",
" <td>13.0</td>\n",
" <td>1835.0</td>\n",
" <td>215.0</td>\n",
" <td>0.930233</td>\n",
" <td>1192.0</td>\n",
" <td>...</td>\n",
" <td>90.810811</td>\n",
" <td>1.121121</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>12</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>720822</th>\n",
" <td>1359</td>\n",
" <td>1410.0</td>\n",
" <td>11.0</td>\n",
" <td>53.239437</td>\n",
" <td>0.995130</td>\n",
" <td>26.0</td>\n",
" <td>1436.0</td>\n",
" <td>147.0</td>\n",
" <td>1.006803</td>\n",
" <td>814.0</td>\n",
" <td>...</td>\n",
" <td>25.000000</td>\n",
" <td>1.136364</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>10</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>459253</th>\n",
" <td>2209</td>\n",
" <td>2207.0</td>\n",
" <td>-2.0</td>\n",
" <td>80.689655</td>\n",
" <td>1.021388</td>\n",
" <td>16.0</td>\n",
" <td>2223.0</td>\n",
" <td>86.0</td>\n",
" <td>0.918605</td>\n",
" <td>413.0</td>\n",
" <td>...</td>\n",
" <td>2.352941</td>\n",
" <td>1.176471</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>7</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>711294</th>\n",
" <td>530</td>\n",
" <td>523.0</td>\n",
" <td>-7.0</td>\n",
" <td>12.452830</td>\n",
" <td>0.830189</td>\n",
" <td>17.0</td>\n",
" <td>540.0</td>\n",
" <td>119.0</td>\n",
" <td>0.957983</td>\n",
" <td>666.0</td>\n",
" <td>...</td>\n",
" <td>37.500000</td>\n",
" <td>0.892857</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>10</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>872796</th>\n",
" <td>709</td>\n",
" <td>706.0</td>\n",
" <td>-3.0</td>\n",
" <td>120.000000</td>\n",
" <td>1.030043</td>\n",
" <td>12.0</td>\n",
" <td>718.0</td>\n",
" <td>169.0</td>\n",
" <td>1.295858</td>\n",
" <td>1120.0</td>\n",
" <td>...</td>\n",
" <td>12.336449</td>\n",
" <td>0.850790</td>\n",
" <td>0.0</td>\n",
" <td>47.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>516478</th>\n",
" <td>1925</td>\n",
" <td>2030.0</td>\n",
" <td>65.0</td>\n",
" <td>14.880000</td>\n",
" <td>1.352727</td>\n",
" <td>8.0</td>\n",
" <td>2038.0</td>\n",
" <td>135.0</td>\n",
" <td>0.977778</td>\n",
" <td>761.0</td>\n",
" <td>...</td>\n",
" <td>93.442623</td>\n",
" <td>1.112412</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>62.0</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>764597 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" CRS_DEP_TIME DEP_TIME DEP_DELAY DEP_AFPH DEP_RFPH TAXI_OUT \\\n",
"31121 845 842.0 -3.0 16.842105 0.443213 21.0 \n",
"633500 1315 1316.0 1.0 4.918033 0.983607 9.0 \n",
"747737 1710 1704.0 -6.0 55.555556 1.028807 14.0 \n",
"298943 1840 1920.0 40.0 28.200000 0.587500 18.0 \n",
"843932 1830 1822.0 -8.0 28.846154 0.901442 13.0 \n",
"... ... ... ... ... ... ... \n",
"720822 1359 1410.0 11.0 53.239437 0.995130 26.0 \n",
"459253 2209 2207.0 -2.0 80.689655 1.021388 16.0 \n",
"711294 530 523.0 -7.0 12.452830 0.830189 17.0 \n",
"872796 709 706.0 -3.0 120.000000 1.030043 12.0 \n",
"516478 1925 2030.0 65.0 14.880000 1.352727 8.0 \n",
"\n",
" WHEELS_OFF CRS_ELAPSED_TIME PCT_ELAPSED_TIME DISTANCE ... \\\n",
"31121 903.0 106.0 0.886792 331.0 ... \n",
"633500 1325.0 121.0 1.107438 624.0 ... \n",
"747737 1718.0 67.0 1.074627 304.0 ... \n",
"298943 1938.0 161.0 0.888199 852.0 ... \n",
"843932 1835.0 215.0 0.930233 1192.0 ... \n",
"... ... ... ... ... ... \n",
"720822 1436.0 147.0 1.006803 814.0 ... \n",
"459253 2223.0 86.0 0.918605 413.0 ... \n",
"711294 540.0 119.0 0.957983 666.0 ... \n",
"872796 718.0 169.0 1.295858 1120.0 ... \n",
"516478 2038.0 135.0 0.977778 761.0 ... \n",
"\n",
" ARR_AFPH ARR_RFPH WEATHER_DELAY NAS_DELAY SECURITY_DELAY \\\n",
"31121 85.333333 1.145414 0.0 0.0 0.0 \n",
"633500 111.891892 1.286114 0.0 0.0 0.0 \n",
"747737 17.288136 0.751658 0.0 0.0 0.0 \n",
"298943 38.780488 1.157627 22.0 0.0 0.0 \n",
"843932 90.810811 1.121121 0.0 0.0 0.0 \n",
"... ... ... ... ... ... \n",
"720822 25.000000 1.136364 0.0 0.0 0.0 \n",
"459253 2.352941 1.176471 0.0 0.0 0.0 \n",
"711294 37.500000 0.892857 0.0 0.0 0.0 \n",
"872796 12.336449 0.850790 0.0 47.0 0.0 \n",
"516478 93.442623 1.112412 0.0 0.0 0.0 \n",
"\n",
" LATE_AIRCRAFT_DELAY DEP_MONTH DEP_DOW ORIGIN_HUB DEST_HUB \n",
"31121 0.0 1 6 1 1 \n",
"633500 0.0 9 4 0 1 \n",
"747737 0.0 10 1 1 0 \n",
"298943 0.0 5 5 0 1 \n",
"843932 0.0 12 5 1 1 \n",
"... ... ... ... ... ... \n",
"720822 0.0 10 4 1 0 \n",
"459253 0.0 7 6 1 0 \n",
"711294 0.0 10 1 0 1 \n",
"872796 0.0 12 3 1 0 \n",
"516478 62.0 7 0 0 1 \n",
"\n",
"[764597 rows x 21 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"31121 0.0\n",
"633500 0.0\n",
"747737 0.0\n",
"298943 0.0\n",
"843932 0.0\n",
" ... \n",
"720822 0.0\n",
"459253 0.0\n",
"711294 0.0\n",
"872796 0.0\n",
"516478 0.0\n",
"Name: CARRIER_DELAY, Length: 764597, dtype: float64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"31121 0\n",
"633500 0\n",
"747737 0\n",
"298943 0\n",
"843932 0\n",
" ..\n",
"720822 0\n",
"459253 0\n",
"711294 0\n",
"872796 0\n",
"516478 0\n",
"Name: CARRIER_DELAY, Length: 764597, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 764597 entries, 31121 to 516478\n",
"Data columns (total 21 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 CRS_DEP_TIME 764597 non-null int64 \n",
" 1 DEP_TIME 764597 non-null float64\n",
" 2 DEP_DELAY 764597 non-null float64\n",
" 3 DEP_AFPH 764597 non-null float64\n",
" 4 DEP_RFPH 764597 non-null float64\n",
" 5 TAXI_OUT 764597 non-null float64\n",
" 6 WHEELS_OFF 764597 non-null float64\n",
" 7 CRS_ELAPSED_TIME 764597 non-null float64\n",
" 8 PCT_ELAPSED_TIME 764597 non-null float64\n",
" 9 DISTANCE 764597 non-null float64\n",
" 10 CRS_ARR_TIME 764597 non-null int64 \n",
" 11 ARR_AFPH 764597 non-null float64\n",
" 12 ARR_RFPH 764597 non-null float64\n",
" 13 WEATHER_DELAY 764597 non-null float64\n",
" 14 NAS_DELAY 764597 non-null float64\n",
" 15 SECURITY_DELAY 764597 non-null float64\n",
" 16 LATE_AIRCRAFT_DELAY 764597 non-null float64\n",
" 17 DEP_MONTH 764597 non-null int64 \n",
" 18 DEP_DOW 764597 non-null int64 \n",
" 19 ORIGIN_HUB 764597 non-null int64 \n",
" 20 DEST_HUB 764597 non-null int64 \n",
"dtypes: float64(15), int64(6)\n",
"memory usage: 128.3 MB\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Fixed random state so the split and the models are reproducible\n",
"rand = 9\n",
"# A carrier delay strictly above this many minutes is labelled as class 1\n",
"delay_threshold_min = 15\n",
"# Target the models must predict\n",
"y = df[\"CARRIER_DELAY\"]\n",
"# Feature matrix: every column except the target (drop already returns a new frame)\n",
"X = df.drop(columns=[\"CARRIER_DELAY\"])\n",
"X_train, X_test, y_train_reg, y_test_reg = train_test_split(\n",
"    X, y, test_size=0.15, random_state=rand\n",
")\n",
"# Binary labels for the classifiers (delay above the threshold -> 1, otherwise 0).\n",
"# A vectorized comparison replaces the per-row Python lambda; the result is the\n",
"# same int64 0/1 series, computed without calling Python code for every row.\n",
"y_train_class = (y_train_reg > delay_threshold_min).astype(\"int64\")\n",
"y_test_class = (y_test_reg > delay_threshold_min).astype(\"int64\")\n",
"\n",
"display(X_train)\n",
"display(y_train_reg)\n",
"display(y_train_class)\n",
"X_train.info()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "aVjZol-yrYbH"
},
"source": [
"#### Определение линейной корреляции признаков с целевым признаком с помощью корреляции Пирсона"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_1e7XE_Orf0r",
"outputId": "efbb620c-c92b-481e-d613-a34633a4ba86"
},
"outputs": [
{
"data": {
"text/plain": [
"CARRIER_DELAY 1.000000\n",
"DEP_DELAY 0.703935\n",
"ARR_RFPH 0.101742\n",
"LATE_AIRCRAFT_DELAY 0.083166\n",
"DEP_RFPH 0.058659\n",
"ARR_AFPH 0.035135\n",
"DEP_TIME 0.030941\n",
"NAS_DELAY 0.026792\n",
"WHEELS_OFF 0.026787\n",
"TAXI_OUT 0.024635\n",
"PCT_ELAPSED_TIME 0.020980\n",
"CRS_DEP_TIME 0.016032\n",
"ORIGIN_HUB 0.015334\n",
"DEST_HUB 0.013932\n",
"DISTANCE 0.010680\n",
"DEP_MONTH 0.009728\n",
"CRS_ELAPSED_TIME 0.008801\n",
"DEP_DOW 0.007043\n",
"CRS_ARR_TIME 0.007029\n",
"DEP_AFPH 0.006053\n",
"WEATHER_DELAY 0.003002\n",
"SECURITY_DELAY 0.000460\n",
"Name: CARRIER_DELAY, dtype: float64"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pairwise correlation matrix of all columns (DataFrame.corr default method)\n",
"corr = df.corr()\n",
"# Rank every column by the absolute strength of its correlation with the target\n",
"corr[\"CARRIER_DELAY\"].abs().sort_values(ascending=False)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "xxPlvpOdrnY_"
},
"source": [
"#### Использование регрессионных моделей для предсказания задержки рейса"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"id": "2m81QO87rzal"
},
"outputs": [],
"source": [
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import PolynomialFeatures, StandardScaler\n",
"from sklearn import linear_model, tree, neighbors, ensemble, neural_network\n",
"\n",
"\n",
"def _poly_linear(interaction_only):\n",
"    \"\"\"Build a degree-2 PolynomialFeatures -> LinearRegression pipeline.\n",
"\n",
"    interaction_only is forwarded to PolynomialFeatures; the regressor is\n",
"    created with fit_intercept=False.\n",
"    \"\"\"\n",
"    return make_pipeline(\n",
"        PolynomialFeatures(degree=2, interaction_only=interaction_only),\n",
"        linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n",
"        memory=None,\n",
"    )\n",
"\n",
"\n",
"# Registry of regressors: name -> {\"model\": unfitted estimator}.\n",
"# Entries are added in the order in which they are later trained and reported.\n",
"reg_models = {}\n",
"# Generalized linear models (GLMs)\n",
"reg_models[\"linear\"] = {\"model\": linear_model.LinearRegression(n_jobs=-1)}\n",
"reg_models[\"linear_poly\"] = {\"model\": _poly_linear(interaction_only=False)}\n",
"reg_models[\"linear_interact\"] = {\"model\": _poly_linear(interaction_only=True)}\n",
"reg_models[\"ridge\"] = {\"model\": linear_model.RidgeCV()}\n",
"# Trees\n",
"reg_models[\"decision_tree\"] = {\n",
"    \"model\": tree.DecisionTreeRegressor(max_depth=7, random_state=rand)\n",
"}\n",
"# Nearest neighbours\n",
"reg_models[\"knn\"] = {\n",
"    \"model\": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)\n",
"}\n",
"# Ensemble methods\n",
"reg_models[\"random_forest\"] = {\n",
"    \"model\": ensemble.RandomForestRegressor(\n",
"        max_depth=7, random_state=rand, n_jobs=-1\n",
"    )\n",
"}\n",
"# Neural networks\n",
"reg_models[\"mlp\"] = {\n",
"    \"model\": neural_network.MLPRegressor(\n",
"        hidden_layer_sizes=(21,),\n",
"        max_iter=500,\n",
"        early_stopping=True,\n",
"        random_state=rand,\n",
"    )\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "DFJudSP_tFec"
},
"source": [
"#### Обучение и оценка регрессионных моделей"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XZEDVAR0tIzN",
"outputId": "4f1f55ff-85dd-49d9-ddaf-3758419bdd6a"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: linear\n",
"Model: linear_poly\n",
"Model: linear_interact\n",
"Model: ridge\n",
"Model: decision_tree\n",
"Model: knn\n",
"Model: random_forest\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"0.00s - Debugger warning: It seems that frozen modules are being used, which may\n",
"0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off\n",
"0.00s - to python to disable frozen modules.\n",
"0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: mlp\n"
]
}
],
"source": [
"import math\n",
"from sklearn import metrics\n",
"\n",
"# Fit every registered regressor on the training split, then store the fitted\n",
"# estimator, its test predictions and its train/test error metrics in the registry.\n",
"for model_name, spec in reg_models.items():\n",
"    print(f'Model: {model_name}')\n",
"    # Estimators are fed plain NumPy arrays rather than pandas objects\n",
"    fitted_model = spec[\"model\"].fit(\n",
"        X_train.values, y_train_reg.to_numpy().ravel()\n",
"    )\n",
"    y_train_pred = fitted_model.predict(X_train.values)\n",
"    y_test_pred = fitted_model.predict(X_test.values)\n",
"    train_mse = metrics.mean_squared_error(y_train_reg, y_train_pred)\n",
"    test_mse = metrics.mean_squared_error(y_test_reg, y_test_pred)\n",
"    spec.update(\n",
"        {\n",
"            \"fitted\": fitted_model,\n",
"            \"preds\": y_test_pred,\n",
"            # RMSE is the square root of the mean squared error\n",
"            \"RMSE_train\": math.sqrt(train_mse),\n",
"            \"RMSE_test\": math.sqrt(test_mse),\n",
"            \"R2_test\": metrics.r2_score(y_test_reg, y_test_pred),\n",
"        }\n",
"    )"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "6GD_HZhGHPXK"
},
"source": [
"#### Вывод оценки в виде таблицы"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"id": "lvcbKDfmHQ6p"
},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_b990e_row0_col0, #T_b990e_row0_col1 {\n",
" background-color: #26818e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_b990e_row0_col2 {\n",
" background-color: #da5a6a;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_b990e_row1_col0 {\n",
" background-color: #25ac82;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_b990e_row1_col1 {\n",
" background-color: #2cb17e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_b990e_row1_col2 {\n",
" background-color: #b42e8d;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_b990e_row2_col0 {\n",
" background-color: #4cc26c;\n",
" color: #000000;\n",
"}\n",
"#T_b990e_row2_col1 {\n",
" background-color: #32b67a;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_b990e_row2_col2 {\n",
" background-color: #b02991;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_b990e_row3_col0 {\n",
" background-color: #58c765;\n",
" color: #000000;\n",
"}\n",
"#T_b990e_row3_col1 {\n",
" background-color: #38b977;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_b990e_row3_col2 {\n",
" background-color: #aa2395;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_b990e_row4_col0 {\n",
" background-color: #5cc863;\n",
" color: #000000;\n",
"}\n",
"#T_b990e_row4_col1 {\n",
" background-color: #56c667;\n",
" color: #000000;\n",
"}\n",
"#T_b990e_row4_col2 {\n",
" background-color: #920fa3;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_b990e_row5_col0, #T_b990e_row6_col0, #T_b990e_row7_col1 {\n",
" background-color: #a8db34;\n",
" color: #000000;\n",
"}\n",
"#T_b990e_row5_col1 {\n",
" background-color: #67cc5c;\n",
" color: #000000;\n",
"}\n",
"#T_b990e_row5_col2, #T_b990e_row6_col2 {\n",
" background-color: #8405a7;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_b990e_row6_col1 {\n",
" background-color: #69cd5b;\n",
" color: #000000;\n",
"}\n",
"#T_b990e_row7_col0 {\n",
" background-color: #8bd646;\n",
" color: #000000;\n",
"}\n",
"#T_b990e_row7_col2 {\n",
" background-color: #4e02a2;\n",
" color: #f1f1f1;\n",
"}\n",
"</style>\n",
"<table id=\"T_b990e\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" >&nbsp;</th>\n",
" <th id=\"T_b990e_level0_col0\" class=\"col_heading level0 col0\" >RMSE_train</th>\n",
" <th id=\"T_b990e_level0_col1\" class=\"col_heading level0 col1\" >RMSE_test</th>\n",
" <th id=\"T_b990e_level0_col2\" class=\"col_heading level0 col2\" >R2_test</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_b990e_level0_row0\" class=\"row_heading level0 row0\" >mlp</th>\n",
" <td id=\"T_b990e_row0_col0\" class=\"data row0 col0\" >3.243516</td>\n",
" <td id=\"T_b990e_row0_col1\" class=\"data row0 col1\" >3.308597</td>\n",
" <td id=\"T_b990e_row0_col2\" class=\"data row0 col2\" >0.987025</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_b990e_level0_row1\" class=\"row_heading level0 row1\" >random_forest</th>\n",
" <td id=\"T_b990e_row1_col0\" class=\"data row1 col0\" >5.143267</td>\n",
" <td id=\"T_b990e_row1_col1\" class=\"data row1 col1\" >6.088249</td>\n",
" <td id=\"T_b990e_row1_col2\" class=\"data row1 col2\" >0.956065</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_b990e_level0_row2\" class=\"row_heading level0 row2\" >linear_poly</th>\n",
" <td id=\"T_b990e_row2_col0\" class=\"data row2 col0\" >6.214010</td>\n",
" <td id=\"T_b990e_row2_col1\" class=\"data row2 col1\" >6.339843</td>\n",
" <td id=\"T_b990e_row2_col2\" class=\"data row2 col2\" >0.952359</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_b990e_level0_row3\" class=\"row_heading level0 row3\" >linear_interact</th>\n",
" <td id=\"T_b990e_row3_col0\" class=\"data row3 col0\" >6.454314</td>\n",
" <td id=\"T_b990e_row3_col1\" class=\"data row3 col1\" >6.562284</td>\n",
" <td id=\"T_b990e_row3_col2\" class=\"data row3 col2\" >0.948957</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_b990e_level0_row4\" class=\"row_heading level0 row4\" >decision_tree</th>\n",
" <td id=\"T_b990e_row4_col0\" class=\"data row4 col0\" >6.542924</td>\n",
" <td id=\"T_b990e_row4_col1\" class=\"data row4 col1\" >7.456335</td>\n",
" <td id=\"T_b990e_row4_col2\" class=\"data row4 col2\" >0.934102</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_b990e_level0_row5\" class=\"row_heading level0 row5\" >linear</th>\n",
" <td id=\"T_b990e_row5_col0\" class=\"data row5 col0\" >7.819643</td>\n",
" <td id=\"T_b990e_row5_col1\" class=\"data row5 col1\" >7.882875</td>\n",
" <td id=\"T_b990e_row5_col2\" class=\"data row5 col2\" >0.926347</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_b990e_level0_row6\" class=\"row_heading level0 row6\" >ridge</th>\n",
" <td id=\"T_b990e_row6_col0\" class=\"data row6 col0\" >7.832066</td>\n",
" <td id=\"T_b990e_row6_col1\" class=\"data row6 col1\" >7.898189</td>\n",
" <td id=\"T_b990e_row6_col2\" class=\"data row6 col2\" >0.926060</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_b990e_level0_row7\" class=\"row_heading level0 row7\" >knn</th>\n",
" <td id=\"T_b990e_row7_col0\" class=\"data row7 col0\" >7.360098</td>\n",
" <td id=\"T_b990e_row7_col1\" class=\"data row7 col1\" >9.259422</td>\n",
" <td id=\"T_b990e_row7_col2\" class=\"data row7 col2\" >0.898377</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x31847a090>"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-model regression scores: one row per entry of `reg_models`.\n",
"reg_metrics = pd.DataFrame.from_dict(reg_models, orient=\"index\").loc[\n",
"    :, [\"RMSE_train\", \"RMSE_test\", \"R2_test\"]\n",
"]\n",
"# Rank by test RMSE (ascending) and colour the error columns and the R2 column\n",
"# with separate colormaps; the parenthesised Styler is the cell's displayed value.\n",
"(\n",
"    reg_metrics.sort_values(by=\"RMSE_test\")\n",
"    .style.background_gradient(\n",
"        cmap=\"viridis\", low=1, high=0.3, subset=[\"RMSE_train\", \"RMSE_test\"]\n",
"    )\n",
"    .background_gradient(cmap=\"plasma\", low=0.3, high=1, subset=[\"R2_test\"])\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "MZrTDbnjJMrB"
},
"source": [
"#### Использование классификаторов для предсказания задержки рейса"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"id": "c_1jMq5IJMSL"
},
"outputs": [],
"source": [
"from sklearn import naive_bayes\n",
"\n",
"# Registry of classifiers to compare: model name -> {\"model\": unfitted estimator}.\n",
"# The training cell adds the fitted estimator, predictions and metrics to each entry.\n",
"class_models = {\n",
"    # Generalized linear models (GLMs)\n",
"    # NOTE(review): both logistic models emit an lbfgs ConvergenceWarning when\n",
"    # fitted (see the training cell output); raising max_iter or scaling the\n",
"    # inputs is worth evaluating.\n",
"    \"logistic\": {\"model\": linear_model.LogisticRegression()},\n",
"    # penalty=\"l2\" is already the LogisticRegression default, so this entry\n",
"    # differs from \"logistic\" only by class_weight=\"balanced\".\n",
"    \"ridge\": {\n",
"        \"model\": linear_model.LogisticRegression(penalty=\"l2\", class_weight=\"balanced\")\n",
"    },\n",
"    # Tree\n",
"    \"decision_tree\": {\n",
"        \"model\": tree.DecisionTreeClassifier(max_depth=7, random_state=rand)\n",
"    },\n",
"    # Nearest neighbours\n",
"    \"knn\": {\"model\": neighbors.KNeighborsClassifier(n_neighbors=7)},\n",
"    # Naive Bayes\n",
"    \"naive_bayes\": {\"model\": naive_bayes.GaussianNB()},\n",
"    # Ensemble methods\n",
"    \"gradient_boosting\": {\n",
"        # Seeded like the other stochastic estimators so reruns are reproducible.\n",
"        \"model\": ensemble.GradientBoostingClassifier(\n",
"            n_estimators=210, random_state=rand\n",
"        )\n",
"    },\n",
"    \"random_forest\": {\n",
"        \"model\": ensemble.RandomForestClassifier(\n",
"            max_depth=11, class_weight=\"balanced\", random_state=rand\n",
"        )\n",
"    },\n",
"    # Neural networks\n",
"    \"mlp\": {\n",
"        # The MLP is the only model fitted on standardized features.\n",
"        \"model\": make_pipeline(\n",
"            StandardScaler(),\n",
"            neural_network.MLPClassifier(\n",
"                hidden_layer_sizes=(7,),\n",
"                max_iter=500,\n",
"                early_stopping=True,\n",
"                random_state=rand,\n",
"            ),\n",
"            memory=None,\n",
"        )\n",
"    },\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "rMzuL4OxKAns"
},
"source": [
"#### Определение сбалансированности выборки для классификации"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"id": "BOVmTCPHJ5Au"
},
"outputs": [
{
"data": {
"text/plain": [
"0.061283264255549"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fraction of training samples labelled 1, used to gauge class balance.\n",
"int((y_train_class == 1).sum()) / len(y_train_class)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "H8Z7-KugJ7xR"
},
"source": [
"#### Обучение и оценка классификаторов"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"id": "RyR1_5m4KBTs"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: logistic\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/user/Projects/python/ckexp/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" n_iter_i = _check_optimize_result(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: ridge\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/user/Projects/python/ckexp/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" n_iter_i = _check_optimize_result(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: decision_tree\n",
"Model: knn\n",
"Model: naive_bayes\n",
"Model: gradient_boosting\n",
"Model: random_forest\n",
"Model: mlp\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"# Convert the feature frames and the training target to NumPy once; every model\n",
"# below is fitted and scored on the same arrays.\n",
"X_train_values = X_train.values\n",
"X_test_values = X_test.values\n",
"y_train_values = y_train_class.to_numpy().ravel()\n",
"\n",
"for model_name, model_entry in class_models.items():\n",
"    print(f\"Model: {model_name}\")\n",
"    fitted_model = model_entry[\"model\"].fit(X_train_values, y_train_values)\n",
"    y_train_pred = fitted_model.predict(X_train_values)\n",
"    # Column 1 of predict_proba is the score fed to ROC AUC below.\n",
"    y_test_prob = fitted_model.predict_proba(X_test_values)[:, 1]\n",
"    y_test_pred = fitted_model.predict(X_test_values)\n",
"\n",
"    # Store the fitted estimator, its test outputs and the metrics next to the\n",
"    # unfitted \"model\" entry so the summary table can be built by model name.\n",
"    model_entry.update(\n",
"        {\n",
"            \"fitted\": fitted_model,\n",
"            \"probs\": y_test_prob,\n",
"            \"preds\": y_test_pred,\n",
"            \"Accuracy_train\": metrics.accuracy_score(y_train_class, y_train_pred),\n",
"            \"Accuracy_test\": metrics.accuracy_score(y_test_class, y_test_pred),\n",
"            \"Recall_train\": metrics.recall_score(y_train_class, y_train_pred),\n",
"            \"Recall_test\": metrics.recall_score(y_test_class, y_test_pred),\n",
"            \"ROC_AUC_test\": metrics.roc_auc_score(y_test_class, y_test_prob),\n",
"            \"F1_test\": metrics.f1_score(y_test_class, y_test_pred),\n",
"            \"MCC_test\": metrics.matthews_corrcoef(y_test_class, y_test_pred),\n",
"        }\n",
"    )"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "NzCyXXXFKdx2"
},
"source": [
"#### Вывод оценки в виде таблицы"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"id": "VOhaFiEYKeN5"
},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_93e7c_row0_col0, #T_93e7c_row0_col1 {\n",
" background-color: #da5a6a;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row0_col2 {\n",
" background-color: #a2da37;\n",
" color: #000000;\n",
"}\n",
"#T_93e7c_row0_col3, #T_93e7c_row0_col4, #T_93e7c_row0_col5, #T_93e7c_row0_col6, #T_93e7c_row1_col4, #T_93e7c_row2_col2, #T_93e7c_row2_col3 {\n",
" background-color: #a8db34;\n",
" color: #000000;\n",
"}\n",
"#T_93e7c_row1_col0, #T_93e7c_row1_col1 {\n",
" background-color: #d14e72;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row1_col2 {\n",
" background-color: #7fd34e;\n",
" color: #000000;\n",
"}\n",
"#T_93e7c_row1_col3 {\n",
" background-color: #81d34d;\n",
" color: #000000;\n",
"}\n",
"#T_93e7c_row1_col5, #T_93e7c_row1_col6 {\n",
" background-color: #8ed645;\n",
" color: #000000;\n",
"}\n",
"#T_93e7c_row2_col0 {\n",
" background-color: #7401a8;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row2_col1 {\n",
" background-color: #7100a8;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row2_col4, #T_93e7c_row3_col4 {\n",
" background-color: #a0da39;\n",
" color: #000000;\n",
"}\n",
"#T_93e7c_row2_col5 {\n",
" background-color: #35b779;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row2_col6 {\n",
" background-color: #3dbc74;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row3_col0 {\n",
" background-color: #c5407e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row3_col1 {\n",
" background-color: #c43e7f;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row3_col2, #T_93e7c_row3_col3 {\n",
" background-color: #70cf57;\n",
" color: #000000;\n",
"}\n",
"#T_93e7c_row3_col5, #T_93e7c_row3_col6 {\n",
" background-color: #73d056;\n",
" color: #000000;\n",
"}\n",
"#T_93e7c_row4_col0 {\n",
" background-color: #7a02a8;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row4_col1 {\n",
" background-color: #7701a8;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row4_col2, #T_93e7c_row4_col3 {\n",
" background-color: #93d741;\n",
" color: #000000;\n",
"}\n",
"#T_93e7c_row4_col4 {\n",
" background-color: #90d743;\n",
" color: #000000;\n",
"}\n",
"#T_93e7c_row4_col5 {\n",
" background-color: #34b679;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row4_col6, #T_93e7c_row5_col2, #T_93e7c_row6_col2 {\n",
" background-color: #3aba76;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row5_col0 {\n",
" background-color: #b83289;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row5_col1 {\n",
" background-color: #b7318a;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row5_col3 {\n",
" background-color: #3bbb75;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row5_col4 {\n",
" background-color: #6ece58;\n",
" color: #000000;\n",
"}\n",
"#T_93e7c_row5_col5 {\n",
" background-color: #52c569;\n",
" color: #000000;\n",
"}\n",
"#T_93e7c_row5_col6 {\n",
" background-color: #54c568;\n",
" color: #000000;\n",
"}\n",
"#T_93e7c_row6_col0 {\n",
" background-color: #b42e8d;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row6_col1 {\n",
" background-color: #a62098;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row6_col3 {\n",
" background-color: #2ab07f;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row6_col4 {\n",
" background-color: #5ec962;\n",
" color: #000000;\n",
"}\n",
"#T_93e7c_row6_col5, #T_93e7c_row6_col6 {\n",
" background-color: #38b977;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row7_col0, #T_93e7c_row7_col1 {\n",
" background-color: #4e02a2;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_93e7c_row7_col2, #T_93e7c_row7_col3, #T_93e7c_row7_col4, #T_93e7c_row7_col5, #T_93e7c_row7_col6 {\n",
" background-color: #26818e;\n",
" color: #f1f1f1;\n",
"}\n",
"</style>\n",
"<table id=\"T_93e7c\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" >&nbsp;</th>\n",
" <th id=\"T_93e7c_level0_col0\" class=\"col_heading level0 col0\" >Accuracy_train</th>\n",
" <th id=\"T_93e7c_level0_col1\" class=\"col_heading level0 col1\" >Accuracy_test</th>\n",
" <th id=\"T_93e7c_level0_col2\" class=\"col_heading level0 col2\" >Recall_train</th>\n",
" <th id=\"T_93e7c_level0_col3\" class=\"col_heading level0 col3\" >Recall_test</th>\n",
" <th id=\"T_93e7c_level0_col4\" class=\"col_heading level0 col4\" >ROC_AUC_test</th>\n",
" <th id=\"T_93e7c_level0_col5\" class=\"col_heading level0 col5\" >F1_test</th>\n",
" <th id=\"T_93e7c_level0_col6\" class=\"col_heading level0 col6\" >MCC_test</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_93e7c_level0_row0\" class=\"row_heading level0 row0\" >mlp</th>\n",
" <td id=\"T_93e7c_row0_col0\" class=\"data row0 col0\" >0.998482</td>\n",
" <td id=\"T_93e7c_row0_col1\" class=\"data row0 col1\" >0.998555</td>\n",
" <td id=\"T_93e7c_row0_col2\" class=\"data row0 col2\" >0.987131</td>\n",
" <td id=\"T_93e7c_row0_col3\" class=\"data row0 col3\" >0.988865</td>\n",
" <td id=\"T_93e7c_row0_col4\" class=\"data row0 col4\" >0.999877</td>\n",
" <td id=\"T_93e7c_row0_col5\" class=\"data row0 col5\" >0.988207</td>\n",
" <td id=\"T_93e7c_row0_col6\" class=\"data row0 col6\" >0.987437</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_93e7c_level0_row1\" class=\"row_heading level0 row1\" >gradient_boosting</th>\n",
" <td id=\"T_93e7c_row1_col0\" class=\"data row1 col0\" >0.991725</td>\n",
" <td id=\"T_93e7c_row1_col1\" class=\"data row1 col1\" >0.991662</td>\n",
" <td id=\"T_93e7c_row1_col2\" class=\"data row1 col2\" >0.892930</td>\n",
" <td id=\"T_93e7c_row1_col3\" class=\"data row1 col3\" >0.893851</td>\n",
" <td id=\"T_93e7c_row1_col4\" class=\"data row1 col4\" >0.998885</td>\n",
" <td id=\"T_93e7c_row1_col5\" class=\"data row1 col5\" >0.929223</td>\n",
" <td id=\"T_93e7c_row1_col6\" class=\"data row1 col6\" >0.925619</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_93e7c_level0_row2\" class=\"row_heading level0 row2\" >random_forest</th>\n",
" <td id=\"T_93e7c_row2_col0\" class=\"data row2 col0\" >0.941166</td>\n",
" <td id=\"T_93e7c_row2_col1\" class=\"data row2 col1\" >0.940325</td>\n",
" <td id=\"T_93e7c_row2_col2\" class=\"data row2 col2\" >0.999552</td>\n",
" <td id=\"T_93e7c_row2_col3\" class=\"data row2 col3\" >0.992375</td>\n",
" <td id=\"T_93e7c_row2_col4\" class=\"data row2 col4\" >0.995145</td>\n",
" <td id=\"T_93e7c_row2_col5\" class=\"data row2 col5\" >0.670675</td>\n",
" <td id=\"T_93e7c_row2_col6\" class=\"data row2 col6\" >0.685702</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_93e7c_level0_row3\" class=\"row_heading level0 row3\" >decision_tree</th>\n",
" <td id=\"T_93e7c_row3_col0\" class=\"data row3 col0\" >0.983297</td>\n",
" <td id=\"T_93e7c_row3_col1\" class=\"data row3 col1\" >0.982895</td>\n",
" <td id=\"T_93e7c_row3_col2\" class=\"data row3 col2\" >0.856969</td>\n",
" <td id=\"T_93e7c_row3_col3\" class=\"data row3 col3\" >0.852215</td>\n",
" <td id=\"T_93e7c_row3_col4\" class=\"data row3 col4\" >0.994932</td>\n",
" <td id=\"T_93e7c_row3_col5\" class=\"data row3 col5\" >0.859182</td>\n",
" <td id=\"T_93e7c_row3_col6\" class=\"data row3 col6\" >0.850110</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_93e7c_level0_row4\" class=\"row_heading level0 row4\" >ridge</th>\n",
" <td id=\"T_93e7c_row4_col0\" class=\"data row4 col0\" >0.943453</td>\n",
" <td id=\"T_93e7c_row4_col1\" class=\"data row4 col1\" >0.942526</td>\n",
" <td id=\"T_93e7c_row4_col2\" class=\"data row4 col2\" >0.945579</td>\n",
" <td id=\"T_93e7c_row4_col3\" class=\"data row4 col3\" >0.940934</td>\n",
" <td id=\"T_93e7c_row4_col4\" class=\"data row4 col4\" >0.983777</td>\n",
" <td id=\"T_93e7c_row4_col5\" class=\"data row4 col5\" >0.667210</td>\n",
" <td id=\"T_93e7c_row4_col6\" class=\"data row4 col6\" >0.673110</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_93e7c_level0_row5\" class=\"row_heading level0 row5\" >logistic</th>\n",
" <td id=\"T_93e7c_row5_col0\" class=\"data row5 col0\" >0.975054</td>\n",
" <td id=\"T_93e7c_row5_col1\" class=\"data row5 col1\" >0.975031</td>\n",
" <td id=\"T_93e7c_row5_col2\" class=\"data row5 col2\" >0.683292</td>\n",
" <td id=\"T_93e7c_row5_col3\" class=\"data row5 col3\" >0.680828</td>\n",
" <td id=\"T_93e7c_row5_col4\" class=\"data row5 col4\" >0.960287</td>\n",
" <td id=\"T_93e7c_row5_col5\" class=\"data row5 col5\" >0.769546</td>\n",
" <td id=\"T_93e7c_row5_col6\" class=\"data row5 col6\" >0.763854</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_93e7c_level0_row6\" class=\"row_heading level0 row6\" >knn</th>\n",
" <td id=\"T_93e7c_row6_col0\" class=\"data row6 col0\" >0.972886</td>\n",
" <td id=\"T_93e7c_row6_col1\" class=\"data row6 col1\" >0.965123</td>\n",
" <td id=\"T_93e7c_row6_col2\" class=\"data row6 col2\" >0.680645</td>\n",
" <td id=\"T_93e7c_row6_col3\" class=\"data row6 col3\" >0.607722</td>\n",
" <td id=\"T_93e7c_row6_col4\" class=\"data row6 col4\" >0.948387</td>\n",
" <td id=\"T_93e7c_row6_col5\" class=\"data row6 col5\" >0.680906</td>\n",
" <td id=\"T_93e7c_row6_col6\" class=\"data row6 col6\" >0.668176</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_93e7c_level0_row7\" class=\"row_heading level0 row7\" >naive_bayes</th>\n",
" <td id=\"T_93e7c_row7_col0\" class=\"data row7 col0\" >0.925119</td>\n",
" <td id=\"T_93e7c_row7_col1\" class=\"data row7 col1\" >0.925539</td>\n",
" <td id=\"T_93e7c_row7_col2\" class=\"data row7 col2\" >0.279126</td>\n",
" <td id=\"T_93e7c_row7_col3\" class=\"data row7 col3\" >0.274268</td>\n",
" <td id=\"T_93e7c_row7_col4\" class=\"data row7 col4\" >0.811869</td>\n",
" <td id=\"T_93e7c_row7_col5\" class=\"data row7 col5\" >0.310858</td>\n",
" <td id=\"T_93e7c_row7_col6\" class=\"data row7 col6\" >0.274984</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x32ad627d0>"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-model classification scores: one row per entry of `class_models`.\n",
"class_metrics = pd.DataFrame.from_dict(class_models, orient=\"index\").loc[\n",
"    :,\n",
"    [\n",
"        \"Accuracy_train\",\n",
"        \"Accuracy_test\",\n",
"        \"Recall_train\",\n",
"        \"Recall_test\",\n",
"        \"ROC_AUC_test\",\n",
"        \"F1_test\",\n",
"        \"MCC_test\",\n",
"    ],\n",
"]\n",
"# Best test ROC AUC first; accuracy columns and the remaining metrics are shaded\n",
"# with different colormaps. The parenthesised Styler is the cell's displayed value.\n",
"(\n",
"    class_metrics.sort_values(by=\"ROC_AUC_test\", ascending=False)\n",
"    .style.background_gradient(\n",
"        cmap=\"plasma\", low=0.3, high=1, subset=[\"Accuracy_train\", \"Accuracy_test\"]\n",
"    )\n",
"    .background_gradient(\n",
"        cmap=\"viridis\",\n",
"        low=1,\n",
"        high=0.3,\n",
"        subset=[\"Recall_train\", \"Recall_test\", \"ROC_AUC_test\", \"F1_test\", \"MCC_test\"],\n",
"    )\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Интерпретация результатов для моделей на основе \"белого ящика\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Линейная регрессия"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"coefficients:\t[ 4.54984539e-03 -5.25067742e-03 8.94125541e-01 -1.52961053e-02\n",
" -4.69623002e-01 1.25277815e-01 -6.46744472e-04 -1.26240049e-02\n",
" 4.50112895e+01 6.76385421e-04 -3.69920254e-04 5.47855860e-04\n",
" 3.73866548e-01 -9.06364154e-01 -6.74052666e-01 -9.17411191e-01\n",
" -9.29843952e-01 -3.96621856e-02 -1.79666480e-02 -1.02912927e+00\n",
" -3.94934854e-01]\n",
"intercept:\t-37.86177932752649\n",
"y = -37.86 + 0.0045X1 + -0.0053X2 + 0.894X3 + ...\n"
]
}
],
"source": [
"# Learned parameters of the fitted plain linear regression.\n",
"intercept_lm = reg_models[\"linear\"][\"fitted\"].intercept_\n",
"coefs_lm = reg_models[\"linear\"][\"fitted\"].coef_\n",
"\n",
"# Raw coefficients and intercept, then the first three terms of the fitted equation.\n",
"print(\n",
"    \"coefficients:\\t%s\" % coefs_lm,\n",
"    \"intercept:\\t%s\" % intercept_lm,\n",
"    \"y = %0.2f + %0.4fX1 + %0.4fX2 + %0.3fX3 + ...\"\n",
"    % (intercept_lm, coefs_lm[0], coefs_lm[1], coefs_lm[2]),\n",
"    sep=\"\\n\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>feature</th>\n",
" <th>coef</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CRS_DEP_TIME</td>\n",
" <td>0.004550</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>DEP_TIME</td>\n",
" <td>-0.005251</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>DEP_DELAY</td>\n",
" <td>0.894126</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>DEP_AFPH</td>\n",
" <td>-0.015296</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>DEP_RFPH</td>\n",
" <td>-0.469623</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>TAXI_OUT</td>\n",
" <td>0.125278</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>WHEELS_OFF</td>\n",
" <td>-0.000647</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>CRS_ELAPSED_TIME</td>\n",
" <td>-0.012624</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>PCT_ELAPSED_TIME</td>\n",
" <td>45.011289</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>DISTANCE</td>\n",
" <td>0.000676</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>CRS_ARR_TIME</td>\n",
" <td>-0.000370</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>ARR_AFPH</td>\n",
" <td>0.000548</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>ARR_RFPH</td>\n",
" <td>0.373867</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>WEATHER_DELAY</td>\n",
" <td>-0.906364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>NAS_DELAY</td>\n",
" <td>-0.674053</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>SECURITY_DELAY</td>\n",
" <td>-0.917411</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>LATE_AIRCRAFT_DELAY</td>\n",
" <td>-0.929844</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>DEP_MONTH</td>\n",
" <td>-0.039662</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>DEP_DOW</td>\n",
" <td>-0.017967</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>ORIGIN_HUB</td>\n",
" <td>-1.029129</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>DEST_HUB</td>\n",
" <td>-0.394935</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" feature coef\n",
"0 CRS_DEP_TIME 0.004550\n",
"1 DEP_TIME -0.005251\n",
"2 DEP_DELAY 0.894126\n",
"3 DEP_AFPH -0.015296\n",
"4 DEP_RFPH -0.469623\n",
"5 TAXI_OUT 0.125278\n",
"6 WHEELS_OFF -0.000647\n",
"7 CRS_ELAPSED_TIME -0.012624\n",
"8 PCT_ELAPSED_TIME 45.011289\n",
"9 DISTANCE 0.000676\n",
"10 CRS_ARR_TIME -0.000370\n",
"11 ARR_AFPH 0.000548\n",
"12 ARR_RFPH 0.373867\n",
"13 WEATHER_DELAY -0.906364\n",
"14 NAS_DELAY -0.674053\n",
"15 SECURITY_DELAY -0.917411\n",
"16 LATE_AIRCRAFT_DELAY -0.929844\n",
"17 DEP_MONTH -0.039662\n",
"18 DEP_DOW -0.017967\n",
"19 ORIGIN_HUB -1.029129\n",
"20 DEST_HUB -0.394935"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Pair each training feature name with its linear-regression coefficient.\n",
"coef_df = pd.DataFrame(\n",
"    {\n",
"        \"feature\": X_train.columns.tolist(),\n",
"        \"coef\": coefs_lm,\n",
"    }\n",
")\n",
"display(coef_df)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_be19c_row0_col4, #T_be19c_row1_col4, #T_be19c_row2_col4, #T_be19c_row3_col4, #T_be19c_row4_col4, #T_be19c_row5_col4, #T_be19c_row6_col4, #T_be19c_row7_col4, #T_be19c_row8_col4, #T_be19c_row9_col4, #T_be19c_row10_col4, #T_be19c_row11_col4, #T_be19c_row12_col4, #T_be19c_row13_col4, #T_be19c_row14_col4, #T_be19c_row15_col4, #T_be19c_row16_col4, #T_be19c_row17_col4, #T_be19c_row17_col7, #T_be19c_row18_col4, #T_be19c_row18_col7, #T_be19c_row19_col4, #T_be19c_row19_col7, #T_be19c_row20_col7 {\n",
" background-color: #f0f921;\n",
" color: #000000;\n",
"}\n",
"#T_be19c_row0_col7, #T_be19c_row20_col4 {\n",
" background-color: #3e049c;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_be19c_row1_col7 {\n",
" background-color: #bb3488;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_be19c_row2_col7 {\n",
" background-color: #f1814d;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_be19c_row3_col7 {\n",
" background-color: #f79342;\n",
" color: #000000;\n",
"}\n",
"#T_be19c_row4_col7 {\n",
" background-color: #fdc627;\n",
" color: #000000;\n",
"}\n",
"#T_be19c_row5_col7 {\n",
" background-color: #f7e225;\n",
" color: #000000;\n",
"}\n",
"#T_be19c_row6_col7 {\n",
" background-color: #f5eb27;\n",
" color: #000000;\n",
"}\n",
"#T_be19c_row7_col7, #T_be19c_row8_col7 {\n",
" background-color: #f2f227;\n",
" color: #000000;\n",
"}\n",
"#T_be19c_row9_col7 {\n",
" background-color: #f1f426;\n",
" color: #000000;\n",
"}\n",
"#T_be19c_row10_col7, #T_be19c_row11_col7, #T_be19c_row12_col7 {\n",
" background-color: #f1f525;\n",
" color: #000000;\n",
"}\n",
"#T_be19c_row13_col7, #T_be19c_row14_col7, #T_be19c_row15_col7, #T_be19c_row16_col7 {\n",
" background-color: #f0f724;\n",
" color: #000000;\n",
"}\n",
"</style>\n",
"<table id=\"T_be19c\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" >&nbsp;</th>\n",
" <th id=\"T_be19c_level0_col0\" class=\"col_heading level0 col0\" >feature</th>\n",
" <th id=\"T_be19c_level0_col1\" class=\"col_heading level0 col1\" >Coef.</th>\n",
" <th id=\"T_be19c_level0_col2\" class=\"col_heading level0 col2\" >Std.Err.</th>\n",
" <th id=\"T_be19c_level0_col3\" class=\"col_heading level0 col3\" >t</th>\n",
" <th id=\"T_be19c_level0_col4\" class=\"col_heading level0 col4\" >P>|t|</th>\n",
" <th id=\"T_be19c_level0_col5\" class=\"col_heading level0 col5\" >[0.025</th>\n",
" <th id=\"T_be19c_level0_col6\" class=\"col_heading level0 col6\" >0.975]</th>\n",
" <th id=\"T_be19c_level0_col7\" class=\"col_heading level0 col7\" >t_abs</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row0\" class=\"row_heading level0 row0\" >2</th>\n",
" <td id=\"T_be19c_row0_col0\" class=\"data row0 col0\" >DEP_DELAY</td>\n",
" <td id=\"T_be19c_row0_col1\" class=\"data row0 col1\" >0.894126</td>\n",
" <td id=\"T_be19c_row0_col2\" class=\"data row0 col2\" >0.000303</td>\n",
" <td id=\"T_be19c_row0_col3\" class=\"data row0 col3\" >2951.055978</td>\n",
" <td id=\"T_be19c_row0_col4\" class=\"data row0 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row0_col5\" class=\"data row0 col5\" >0.893532</td>\n",
" <td id=\"T_be19c_row0_col6\" class=\"data row0 col6\" >0.894719</td>\n",
" <td id=\"T_be19c_row0_col7\" class=\"data row0 col7\" >2951.055978</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row1\" class=\"row_heading level0 row1\" >16</th>\n",
" <td id=\"T_be19c_row1_col0\" class=\"data row1 col0\" >LATE_AIRCRAFT_DELAY</td>\n",
" <td id=\"T_be19c_row1_col1\" class=\"data row1 col1\" >-0.929844</td>\n",
" <td id=\"T_be19c_row1_col2\" class=\"data row1 col2\" >0.000509</td>\n",
" <td id=\"T_be19c_row1_col3\" class=\"data row1 col3\" >-1827.018082</td>\n",
" <td id=\"T_be19c_row1_col4\" class=\"data row1 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row1_col5\" class=\"data row1 col5\" >-0.930841</td>\n",
" <td id=\"T_be19c_row1_col6\" class=\"data row1 col6\" >-0.928846</td>\n",
" <td id=\"T_be19c_row1_col7\" class=\"data row1 col7\" >1827.018082</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row2\" class=\"row_heading level0 row2\" >13</th>\n",
" <td id=\"T_be19c_row2_col0\" class=\"data row2 col0\" >WEATHER_DELAY</td>\n",
" <td id=\"T_be19c_row2_col1\" class=\"data row2 col1\" >-0.906364</td>\n",
" <td id=\"T_be19c_row2_col2\" class=\"data row2 col2\" >0.000911</td>\n",
" <td id=\"T_be19c_row2_col3\" class=\"data row2 col3\" >-995.366423</td>\n",
" <td id=\"T_be19c_row2_col4\" class=\"data row2 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row2_col5\" class=\"data row2 col5\" >-0.908149</td>\n",
" <td id=\"T_be19c_row2_col6\" class=\"data row2 col6\" >-0.904579</td>\n",
" <td id=\"T_be19c_row2_col7\" class=\"data row2 col7\" >995.366423</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row3\" class=\"row_heading level0 row3\" >14</th>\n",
" <td id=\"T_be19c_row3_col0\" class=\"data row3 col0\" >NAS_DELAY</td>\n",
" <td id=\"T_be19c_row3_col1\" class=\"data row3 col1\" >-0.674053</td>\n",
" <td id=\"T_be19c_row3_col2\" class=\"data row3 col2\" >0.000813</td>\n",
" <td id=\"T_be19c_row3_col3\" class=\"data row3 col3\" >-829.128657</td>\n",
" <td id=\"T_be19c_row3_col4\" class=\"data row3 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row3_col5\" class=\"data row3 col5\" >-0.675646</td>\n",
" <td id=\"T_be19c_row3_col6\" class=\"data row3 col6\" >-0.672459</td>\n",
" <td id=\"T_be19c_row3_col7\" class=\"data row3 col7\" >829.128657</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row4\" class=\"row_heading level0 row4\" >8</th>\n",
" <td id=\"T_be19c_row4_col0\" class=\"data row4 col0\" >PCT_ELAPSED_TIME</td>\n",
" <td id=\"T_be19c_row4_col1\" class=\"data row4 col1\" >45.011289</td>\n",
" <td id=\"T_be19c_row4_col2\" class=\"data row4 col2\" >0.117195</td>\n",
" <td id=\"T_be19c_row4_col3\" class=\"data row4 col3\" >384.072566</td>\n",
" <td id=\"T_be19c_row4_col4\" class=\"data row4 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row4_col5\" class=\"data row4 col5\" >44.781592</td>\n",
" <td id=\"T_be19c_row4_col6\" class=\"data row4 col6\" >45.240987</td>\n",
" <td id=\"T_be19c_row4_col7\" class=\"data row4 col7\" >384.072566</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row5\" class=\"row_heading level0 row5\" >15</th>\n",
" <td id=\"T_be19c_row5_col0\" class=\"data row5 col0\" >SECURITY_DELAY</td>\n",
" <td id=\"T_be19c_row5_col1\" class=\"data row5 col1\" >-0.917411</td>\n",
" <td id=\"T_be19c_row5_col2\" class=\"data row5 col2\" >0.005465</td>\n",
" <td id=\"T_be19c_row5_col3\" class=\"data row5 col3\" >-167.857085</td>\n",
" <td id=\"T_be19c_row5_col4\" class=\"data row5 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row5_col5\" class=\"data row5 col5\" >-0.928123</td>\n",
" <td id=\"T_be19c_row5_col6\" class=\"data row5 col6\" >-0.906699</td>\n",
" <td id=\"T_be19c_row5_col7\" class=\"data row5 col7\" >167.857085</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row6\" class=\"row_heading level0 row6\" >5</th>\n",
" <td id=\"T_be19c_row6_col0\" class=\"data row6 col0\" >TAXI_OUT</td>\n",
" <td id=\"T_be19c_row6_col1\" class=\"data row6 col1\" >0.125278</td>\n",
" <td id=\"T_be19c_row6_col2\" class=\"data row6 col2\" >0.001203</td>\n",
" <td id=\"T_be19c_row6_col3\" class=\"data row6 col3\" >104.119579</td>\n",
" <td id=\"T_be19c_row6_col4\" class=\"data row6 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row6_col5\" class=\"data row6 col5\" >0.122920</td>\n",
" <td id=\"T_be19c_row6_col6\" class=\"data row6 col6\" >0.127636</td>\n",
" <td id=\"T_be19c_row6_col7\" class=\"data row6 col7\" >104.119579</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row7\" class=\"row_heading level0 row7\" >0</th>\n",
" <td id=\"T_be19c_row7_col0\" class=\"data row7 col0\" >CRS_DEP_TIME</td>\n",
" <td id=\"T_be19c_row7_col1\" class=\"data row7 col1\" >0.004550</td>\n",
" <td id=\"T_be19c_row7_col2\" class=\"data row7 col2\" >0.000072</td>\n",
" <td id=\"T_be19c_row7_col3\" class=\"data row7 col3\" >62.871693</td>\n",
" <td id=\"T_be19c_row7_col4\" class=\"data row7 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row7_col5\" class=\"data row7 col5\" >0.004408</td>\n",
" <td id=\"T_be19c_row7_col6\" class=\"data row7 col6\" >0.004692</td>\n",
" <td id=\"T_be19c_row7_col7\" class=\"data row7 col7\" >62.871693</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row8\" class=\"row_heading level0 row8\" >1</th>\n",
" <td id=\"T_be19c_row8_col0\" class=\"data row8 col0\" >DEP_TIME</td>\n",
" <td id=\"T_be19c_row8_col1\" class=\"data row8 col1\" >-0.005251</td>\n",
" <td id=\"T_be19c_row8_col2\" class=\"data row8 col2\" >0.000092</td>\n",
" <td id=\"T_be19c_row8_col3\" class=\"data row8 col3\" >-57.115895</td>\n",
" <td id=\"T_be19c_row8_col4\" class=\"data row8 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row8_col5\" class=\"data row8 col5\" >-0.005431</td>\n",
" <td id=\"T_be19c_row8_col6\" class=\"data row8 col6\" >-0.005070</td>\n",
" <td id=\"T_be19c_row8_col7\" class=\"data row8 col7\" >57.115895</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row9\" class=\"row_heading level0 row9\" >3</th>\n",
" <td id=\"T_be19c_row9_col0\" class=\"data row9 col0\" >DEP_AFPH</td>\n",
" <td id=\"T_be19c_row9_col1\" class=\"data row9 col1\" >-0.015296</td>\n",
" <td id=\"T_be19c_row9_col2\" class=\"data row9 col2\" >0.000321</td>\n",
" <td id=\"T_be19c_row9_col3\" class=\"data row9 col3\" >-47.724506</td>\n",
" <td id=\"T_be19c_row9_col4\" class=\"data row9 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row9_col5\" class=\"data row9 col5\" >-0.015924</td>\n",
" <td id=\"T_be19c_row9_col6\" class=\"data row9 col6\" >-0.014668</td>\n",
" <td id=\"T_be19c_row9_col7\" class=\"data row9 col7\" >47.724506</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row10\" class=\"row_heading level0 row10\" >19</th>\n",
" <td id=\"T_be19c_row10_col0\" class=\"data row10 col0\" >ORIGIN_HUB</td>\n",
" <td id=\"T_be19c_row10_col1\" class=\"data row10 col1\" >-1.029129</td>\n",
" <td id=\"T_be19c_row10_col2\" class=\"data row10 col2\" >0.026669</td>\n",
" <td id=\"T_be19c_row10_col3\" class=\"data row10 col3\" >-38.589411</td>\n",
" <td id=\"T_be19c_row10_col4\" class=\"data row10 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row10_col5\" class=\"data row10 col5\" >-1.081399</td>\n",
" <td id=\"T_be19c_row10_col6\" class=\"data row10 col6\" >-0.976860</td>\n",
" <td id=\"T_be19c_row10_col7\" class=\"data row10 col7\" >38.589411</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row11\" class=\"row_heading level0 row11\" >12</th>\n",
" <td id=\"T_be19c_row11_col0\" class=\"data row11 col0\" >ARR_RFPH</td>\n",
" <td id=\"T_be19c_row11_col1\" class=\"data row11 col1\" >0.373867</td>\n",
" <td id=\"T_be19c_row11_col2\" class=\"data row11 col2\" >0.013171</td>\n",
" <td id=\"T_be19c_row11_col3\" class=\"data row11 col3\" >28.386031</td>\n",
" <td id=\"T_be19c_row11_col4\" class=\"data row11 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row11_col5\" class=\"data row11 col5\" >0.348052</td>\n",
" <td id=\"T_be19c_row11_col6\" class=\"data row11 col6\" >0.399681</td>\n",
" <td id=\"T_be19c_row11_col7\" class=\"data row11 col7\" >28.386031</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row12\" class=\"row_heading level0 row12\" >4</th>\n",
" <td id=\"T_be19c_row12_col0\" class=\"data row12 col0\" >DEP_RFPH</td>\n",
" <td id=\"T_be19c_row12_col1\" class=\"data row12 col1\" >-0.469623</td>\n",
" <td id=\"T_be19c_row12_col2\" class=\"data row12 col2\" >0.017169</td>\n",
" <td id=\"T_be19c_row12_col3\" class=\"data row12 col3\" >-27.353179</td>\n",
" <td id=\"T_be19c_row12_col4\" class=\"data row12 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row12_col5\" class=\"data row12 col5\" >-0.503273</td>\n",
" <td id=\"T_be19c_row12_col6\" class=\"data row12 col6\" >-0.435973</td>\n",
" <td id=\"T_be19c_row12_col7\" class=\"data row12 col7\" >27.353179</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row13\" class=\"row_heading level0 row13\" >7</th>\n",
" <td id=\"T_be19c_row13_col0\" class=\"data row13 col0\" >CRS_ELAPSED_TIME</td>\n",
" <td id=\"T_be19c_row13_col1\" class=\"data row13 col1\" >-0.012624</td>\n",
" <td id=\"T_be19c_row13_col2\" class=\"data row13 col2\" >0.000660</td>\n",
" <td id=\"T_be19c_row13_col3\" class=\"data row13 col3\" >-19.131516</td>\n",
" <td id=\"T_be19c_row13_col4\" class=\"data row13 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row13_col5\" class=\"data row13 col5\" >-0.013917</td>\n",
" <td id=\"T_be19c_row13_col6\" class=\"data row13 col6\" >-0.011331</td>\n",
" <td id=\"T_be19c_row13_col7\" class=\"data row13 col7\" >19.131516</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row14\" class=\"row_heading level0 row14\" >10</th>\n",
" <td id=\"T_be19c_row14_col0\" class=\"data row14 col0\" >CRS_ARR_TIME</td>\n",
" <td id=\"T_be19c_row14_col1\" class=\"data row14 col1\" >-0.000370</td>\n",
" <td id=\"T_be19c_row14_col2\" class=\"data row14 col2\" >0.000022</td>\n",
" <td id=\"T_be19c_row14_col3\" class=\"data row14 col3\" >-16.938661</td>\n",
" <td id=\"T_be19c_row14_col4\" class=\"data row14 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row14_col5\" class=\"data row14 col5\" >-0.000413</td>\n",
" <td id=\"T_be19c_row14_col6\" class=\"data row14 col6\" >-0.000327</td>\n",
" <td id=\"T_be19c_row14_col7\" class=\"data row14 col7\" >16.938661</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row15\" class=\"row_heading level0 row15\" >20</th>\n",
" <td id=\"T_be19c_row15_col0\" class=\"data row15 col0\" >DEST_HUB</td>\n",
" <td id=\"T_be19c_row15_col1\" class=\"data row15 col1\" >-0.394935</td>\n",
" <td id=\"T_be19c_row15_col2\" class=\"data row15 col2\" >0.026256</td>\n",
" <td id=\"T_be19c_row15_col3\" class=\"data row15 col3\" >-15.041459</td>\n",
" <td id=\"T_be19c_row15_col4\" class=\"data row15 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row15_col5\" class=\"data row15 col5\" >-0.446397</td>\n",
" <td id=\"T_be19c_row15_col6\" class=\"data row15 col6\" >-0.343473</td>\n",
" <td id=\"T_be19c_row15_col7\" class=\"data row15 col7\" >15.041459</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row16\" class=\"row_heading level0 row16\" >17</th>\n",
" <td id=\"T_be19c_row16_col0\" class=\"data row16 col0\" >DEP_MONTH</td>\n",
" <td id=\"T_be19c_row16_col1\" class=\"data row16 col1\" >-0.039662</td>\n",
" <td id=\"T_be19c_row16_col2\" class=\"data row16 col2\" >0.002641</td>\n",
" <td id=\"T_be19c_row16_col3\" class=\"data row16 col3\" >-15.018808</td>\n",
" <td id=\"T_be19c_row16_col4\" class=\"data row16 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row16_col5\" class=\"data row16 col5\" >-0.044838</td>\n",
" <td id=\"T_be19c_row16_col6\" class=\"data row16 col6\" >-0.034486</td>\n",
" <td id=\"T_be19c_row16_col7\" class=\"data row16 col7\" >15.018808</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row17\" class=\"row_heading level0 row17\" >6</th>\n",
" <td id=\"T_be19c_row17_col0\" class=\"data row17 col0\" >WHEELS_OFF</td>\n",
" <td id=\"T_be19c_row17_col1\" class=\"data row17 col1\" >-0.000647</td>\n",
" <td id=\"T_be19c_row17_col2\" class=\"data row17 col2\" >0.000067</td>\n",
" <td id=\"T_be19c_row17_col3\" class=\"data row17 col3\" >-9.646104</td>\n",
" <td id=\"T_be19c_row17_col4\" class=\"data row17 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row17_col5\" class=\"data row17 col5\" >-0.000778</td>\n",
" <td id=\"T_be19c_row17_col6\" class=\"data row17 col6\" >-0.000515</td>\n",
" <td id=\"T_be19c_row17_col7\" class=\"data row17 col7\" >9.646104</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row18\" class=\"row_heading level0 row18\" >9</th>\n",
" <td id=\"T_be19c_row18_col0\" class=\"data row18 col0\" >DISTANCE</td>\n",
" <td id=\"T_be19c_row18_col1\" class=\"data row18 col1\" >0.000676</td>\n",
" <td id=\"T_be19c_row18_col2\" class=\"data row18 col2\" >0.000080</td>\n",
" <td id=\"T_be19c_row18_col3\" class=\"data row18 col3\" >8.428835</td>\n",
" <td id=\"T_be19c_row18_col4\" class=\"data row18 col4\" >0.000000</td>\n",
" <td id=\"T_be19c_row18_col5\" class=\"data row18 col5\" >0.000519</td>\n",
" <td id=\"T_be19c_row18_col6\" class=\"data row18 col6\" >0.000834</td>\n",
" <td id=\"T_be19c_row18_col7\" class=\"data row18 col7\" >8.428835</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row19\" class=\"row_heading level0 row19\" >18</th>\n",
" <td id=\"T_be19c_row19_col0\" class=\"data row19 col0\" >DEP_DOW</td>\n",
" <td id=\"T_be19c_row19_col1\" class=\"data row19 col1\" >-0.017967</td>\n",
" <td id=\"T_be19c_row19_col2\" class=\"data row19 col2\" >0.004487</td>\n",
" <td id=\"T_be19c_row19_col3\" class=\"data row19 col3\" >-4.004561</td>\n",
" <td id=\"T_be19c_row19_col4\" class=\"data row19 col4\" >0.000062</td>\n",
" <td id=\"T_be19c_row19_col5\" class=\"data row19 col5\" >-0.026760</td>\n",
" <td id=\"T_be19c_row19_col6\" class=\"data row19 col6\" >-0.009173</td>\n",
" <td id=\"T_be19c_row19_col7\" class=\"data row19 col7\" >4.004561</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_be19c_level0_row20\" class=\"row_heading level0 row20\" >11</th>\n",
" <td id=\"T_be19c_row20_col0\" class=\"data row20 col0\" >ARR_AFPH</td>\n",
" <td id=\"T_be19c_row20_col1\" class=\"data row20 col1\" >0.000548</td>\n",
" <td id=\"T_be19c_row20_col2\" class=\"data row20 col2\" >0.000332</td>\n",
" <td id=\"T_be19c_row20_col3\" class=\"data row20 col3\" >1.650788</td>\n",
" <td id=\"T_be19c_row20_col4\" class=\"data row20 col4\" >0.098782</td>\n",
" <td id=\"T_be19c_row20_col5\" class=\"data row20 col5\" >-0.000103</td>\n",
" <td id=\"T_be19c_row20_col6\" class=\"data row20 col6\" >0.001198</td>\n",
" <td id=\"T_be19c_row20_col7\" class=\"data row20 col7\" >1.650788</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x32ca95810>"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import statsmodels.api as sm\n",
"\n",
"# Fit OLS on the training set; add_constant supplies the \"const\" (intercept) column.\n",
"linreg_mdl = sm.OLS(y_train_reg, sm.add_constant(X_train)).fit()\n",
"\n",
"# Coefficient table without the intercept row; the feature names move from\n",
"# the index into a regular \"feature\" column.\n",
"summary_df = (\n",
"    linreg_mdl.summary2()\n",
"    .tables[1]\n",
"    .drop([\"const\"])\n",
"    .reset_index()\n",
"    .rename(columns={\"index\": \"feature\"})\n",
")\n",
"summary_df[\"t_abs\"] = summary_df[\"t\"].abs()\n",
"\n",
"# Rank features by |t| and shade both the p-value and the |t| columns.\n",
"(\n",
"    summary_df.sort_values(by=\"t_abs\", ascending=False)\n",
"    .style.background_gradient(\n",
"        cmap=\"plasma_r\", low=0, high=0.1, subset=[\"P>|t|\"]\n",
"    )\n",
"    .background_gradient(cmap=\"plasma_r\", low=0, high=0.1, subset=[\"t_abs\"])\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Гребневая регрессия"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_cfee5_row0_col1, #T_cfee5_row1_col1, #T_cfee5_row2_col2, #T_cfee5_row3_col1, #T_cfee5_row4_col1, #T_cfee5_row5_col2, #T_cfee5_row6_col1, #T_cfee5_row7_col2, #T_cfee5_row8_col1, #T_cfee5_row9_col2, #T_cfee5_row10_col1, #T_cfee5_row11_col2, #T_cfee5_row12_col1, #T_cfee5_row13_col2, #T_cfee5_row14_col2, #T_cfee5_row15_col2, #T_cfee5_row16_col2, #T_cfee5_row17_col2, #T_cfee5_row18_col1, #T_cfee5_row19_col1, #T_cfee5_row20_col1 {\n",
" background-color: #472f7d;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_cfee5_row0_col2, #T_cfee5_row1_col2, #T_cfee5_row2_col1, #T_cfee5_row3_col2, #T_cfee5_row4_col2, #T_cfee5_row5_col1, #T_cfee5_row6_col2, #T_cfee5_row7_col1, #T_cfee5_row8_col2, #T_cfee5_row9_col1, #T_cfee5_row10_col2, #T_cfee5_row11_col1, #T_cfee5_row12_col2, #T_cfee5_row13_col1, #T_cfee5_row14_col1, #T_cfee5_row15_col1, #T_cfee5_row16_col1, #T_cfee5_row17_col1, #T_cfee5_row18_col2, #T_cfee5_row19_col2, #T_cfee5_row20_col2 {\n",
" background-color: #7ad151;\n",
" color: #000000;\n",
"}\n",
"</style>\n",
"<table id=\"T_cfee5\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" >&nbsp;</th>\n",
" <th id=\"T_cfee5_level0_col0\" class=\"col_heading level0 col0\" >feature</th>\n",
" <th id=\"T_cfee5_level0_col1\" class=\"col_heading level0 col1\" >coef_linear</th>\n",
" <th id=\"T_cfee5_level0_col2\" class=\"col_heading level0 col2\" >coef_ridge</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
" <td id=\"T_cfee5_row0_col0\" class=\"data row0 col0\" >CRS_DEP_TIME</td>\n",
" <td id=\"T_cfee5_row0_col1\" class=\"data row0 col1\" >0.004550</td>\n",
" <td id=\"T_cfee5_row0_col2\" class=\"data row0 col2\" >0.004275</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row1\" class=\"row_heading level0 row1\" >1</th>\n",
" <td id=\"T_cfee5_row1_col0\" class=\"data row1 col0\" >DEP_TIME</td>\n",
" <td id=\"T_cfee5_row1_col1\" class=\"data row1 col1\" >-0.005251</td>\n",
" <td id=\"T_cfee5_row1_col2\" class=\"data row1 col2\" >-0.005485</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row2\" class=\"row_heading level0 row2\" >2</th>\n",
" <td id=\"T_cfee5_row2_col0\" class=\"data row2 col0\" >DEP_DELAY</td>\n",
" <td id=\"T_cfee5_row2_col1\" class=\"data row2 col1\" >0.894126</td>\n",
" <td id=\"T_cfee5_row2_col2\" class=\"data row2 col2\" >0.894229</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row3\" class=\"row_heading level0 row3\" >3</th>\n",
" <td id=\"T_cfee5_row3_col0\" class=\"data row3 col0\" >DEP_AFPH</td>\n",
" <td id=\"T_cfee5_row3_col1\" class=\"data row3 col1\" >-0.015296</td>\n",
" <td id=\"T_cfee5_row3_col2\" class=\"data row3 col2\" >-0.015304</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row4\" class=\"row_heading level0 row4\" >4</th>\n",
" <td id=\"T_cfee5_row4_col0\" class=\"data row4 col0\" >DEP_RFPH</td>\n",
" <td id=\"T_cfee5_row4_col1\" class=\"data row4 col1\" >-0.469623</td>\n",
" <td id=\"T_cfee5_row4_col2\" class=\"data row4 col2\" >-0.469623</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row5\" class=\"row_heading level0 row5\" >5</th>\n",
" <td id=\"T_cfee5_row5_col0\" class=\"data row5 col0\" >TAXI_OUT</td>\n",
" <td id=\"T_cfee5_row5_col1\" class=\"data row5 col1\" >0.125278</td>\n",
" <td id=\"T_cfee5_row5_col2\" class=\"data row5 col2\" >0.125284</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row6\" class=\"row_heading level0 row6\" >6</th>\n",
" <td id=\"T_cfee5_row6_col0\" class=\"data row6 col0\" >WHEELS_OFF</td>\n",
" <td id=\"T_cfee5_row6_col1\" class=\"data row6 col1\" >-0.000647</td>\n",
" <td id=\"T_cfee5_row6_col2\" class=\"data row6 col2\" >-0.000889</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row7\" class=\"row_heading level0 row7\" >7</th>\n",
" <td id=\"T_cfee5_row7_col0\" class=\"data row7 col0\" >CRS_ELAPSED_TIME</td>\n",
" <td id=\"T_cfee5_row7_col1\" class=\"data row7 col1\" >-0.012624</td>\n",
" <td id=\"T_cfee5_row7_col2\" class=\"data row7 col2\" >-0.012618</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row8\" class=\"row_heading level0 row8\" >8</th>\n",
" <td id=\"T_cfee5_row8_col0\" class=\"data row8 col0\" >PCT_ELAPSED_TIME</td>\n",
" <td id=\"T_cfee5_row8_col1\" class=\"data row8 col1\" >45.011289</td>\n",
" <td id=\"T_cfee5_row8_col2\" class=\"data row8 col2\" >45.010279</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row9\" class=\"row_heading level0 row9\" >9</th>\n",
" <td id=\"T_cfee5_row9_col0\" class=\"data row9 col0\" >DISTANCE</td>\n",
" <td id=\"T_cfee5_row9_col1\" class=\"data row9 col1\" >0.000676</td>\n",
" <td id=\"T_cfee5_row9_col2\" class=\"data row9 col2\" >0.000718</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row10\" class=\"row_heading level0 row10\" >10</th>\n",
" <td id=\"T_cfee5_row10_col0\" class=\"data row10 col0\" >CRS_ARR_TIME</td>\n",
" <td id=\"T_cfee5_row10_col1\" class=\"data row10 col1\" >-0.000370</td>\n",
" <td id=\"T_cfee5_row10_col2\" class=\"data row10 col2\" >-0.000546</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row11\" class=\"row_heading level0 row11\" >11</th>\n",
" <td id=\"T_cfee5_row11_col0\" class=\"data row11 col0\" >ARR_AFPH</td>\n",
" <td id=\"T_cfee5_row11_col1\" class=\"data row11 col1\" >0.000548</td>\n",
" <td id=\"T_cfee5_row11_col2\" class=\"data row11 col2\" >0.000550</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row12\" class=\"row_heading level0 row12\" >12</th>\n",
" <td id=\"T_cfee5_row12_col0\" class=\"data row12 col0\" >ARR_RFPH</td>\n",
" <td id=\"T_cfee5_row12_col1\" class=\"data row12 col1\" >0.373867</td>\n",
" <td id=\"T_cfee5_row12_col2\" class=\"data row12 col2\" >0.373865</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row13\" class=\"row_heading level0 row13\" >13</th>\n",
" <td id=\"T_cfee5_row13_col0\" class=\"data row13 col0\" >WEATHER_DELAY</td>\n",
" <td id=\"T_cfee5_row13_col1\" class=\"data row13 col1\" >-0.906364</td>\n",
" <td id=\"T_cfee5_row13_col2\" class=\"data row13 col2\" >-0.906358</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row14\" class=\"row_heading level0 row14\" >14</th>\n",
" <td id=\"T_cfee5_row14_col0\" class=\"data row14 col0\" >NAS_DELAY</td>\n",
" <td id=\"T_cfee5_row14_col1\" class=\"data row14 col1\" >-0.674053</td>\n",
" <td id=\"T_cfee5_row14_col2\" class=\"data row14 col2\" >-0.674045</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row15\" class=\"row_heading level0 row15\" >15</th>\n",
" <td id=\"T_cfee5_row15_col0\" class=\"data row15 col0\" >SECURITY_DELAY</td>\n",
" <td id=\"T_cfee5_row15_col1\" class=\"data row15 col1\" >-0.917411</td>\n",
" <td id=\"T_cfee5_row15_col2\" class=\"data row15 col2\" >-0.917411</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row16\" class=\"row_heading level0 row16\" >16</th>\n",
" <td id=\"T_cfee5_row16_col0\" class=\"data row16 col0\" >LATE_AIRCRAFT_DELAY</td>\n",
" <td id=\"T_cfee5_row16_col1\" class=\"data row16 col1\" >-0.929844</td>\n",
" <td id=\"T_cfee5_row16_col2\" class=\"data row16 col2\" >-0.929805</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row17\" class=\"row_heading level0 row17\" >17</th>\n",
" <td id=\"T_cfee5_row17_col0\" class=\"data row17 col0\" >DEP_MONTH</td>\n",
" <td id=\"T_cfee5_row17_col1\" class=\"data row17 col1\" >-0.039662</td>\n",
" <td id=\"T_cfee5_row17_col2\" class=\"data row17 col2\" >-0.039661</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row18\" class=\"row_heading level0 row18\" >18</th>\n",
" <td id=\"T_cfee5_row18_col0\" class=\"data row18 col0\" >DEP_DOW</td>\n",
" <td id=\"T_cfee5_row18_col1\" class=\"data row18 col1\" >-0.017967</td>\n",
" <td id=\"T_cfee5_row18_col2\" class=\"data row18 col2\" >-0.017967</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row19\" class=\"row_heading level0 row19\" >19</th>\n",
" <td id=\"T_cfee5_row19_col0\" class=\"data row19 col0\" >ORIGIN_HUB</td>\n",
" <td id=\"T_cfee5_row19_col1\" class=\"data row19 col1\" >-1.029129</td>\n",
" <td id=\"T_cfee5_row19_col2\" class=\"data row19 col2\" >-1.029140</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_cfee5_level0_row20\" class=\"row_heading level0 row20\" >20</th>\n",
" <td id=\"T_cfee5_row20_col0\" class=\"data row20 col0\" >DEST_HUB</td>\n",
" <td id=\"T_cfee5_row20_col1\" class=\"data row20 col1\" >-0.394935</td>\n",
" <td id=\"T_cfee5_row20_col2\" class=\"data row20 col2\" >-0.394948</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x17753cf50>"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ridge coefficients next to the plain linear ones, one row per feature.\n",
"coefs_ridge = reg_models[\"ridge\"][\"fitted\"].coef_\n",
"coef_ridge_df = pd.DataFrame(\n",
"    dict(\n",
"        feature=X_train.columns.tolist(),\n",
"        coef_linear=coefs_lm,\n",
"        coef_ridge=coefs_ridge,\n",
"    )\n",
")\n",
"# axis=1 applies the gradient within each row, contrasting the two coefficients.\n",
"coef_ridge_df.style.background_gradient(axis=1, cmap=\"viridis_r\", low=0.3, high=0.2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Полиномиальная регрессия"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"253"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"232"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Number of coefficients held by the \"linearregression\" component of each\n",
"# fitted model; display() emits one output per value, in the order listed.\n",
"display(\n",
"    *(\n",
"        len(reg_models[name][\"fitted\"].get_params()[\"linearregression\"].coef_)\n",
"        for name in (\"linear_poly\", \"linear_interact\")\n",
"    )\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Логистическая регрессия"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"coefficients:\t[[-0.00132811 0.00034525 0.15746107 0.00349808 -0.00215053 -0.00445293\n",
" 0.00029184 -0.05167613 -0.00175222 0.0055682 -0.00031922 -0.00757532\n",
" -0.00273998 -0.15351444 -0.12133964 -0.00595224 -0.16451117 -0.01303235\n",
" -0.0052911 0.00048854 -0.00206977]]\n",
"intercept:\t[-0.00229272]\n"
]
},
{
"data": {
"text/plain": [
"DEP_DELAY 6.969920\n",
"CRS_ELAPSED_TIME 4.101834\n",
"LATE_AIRCRAFT_DELAY 4.065346\n",
"DISTANCE 3.616141\n",
"NAS_DELAY 1.672065\n",
"WEATHER_DELAY 1.604186\n",
"CRS_DEP_TIME 0.665926\n",
"ARR_AFPH 0.267888\n",
"DEP_TIME 0.177772\n",
"CRS_ARR_TIME 0.168589\n",
"WHEELS_OFF 0.150765\n",
"DEP_AFPH 0.124024\n",
"DEP_MONTH 0.044475\n",
"TAXI_OUT 0.043947\n",
"DEP_DOW 0.010574\n",
"SECURITY_DELAY 0.009756\n",
"ARR_RFPH 0.001976\n",
"DEP_RFPH 0.001215\n",
"DEST_HUB 0.001007\n",
"ORIGIN_HUB 0.000238\n",
"PCT_ELAPSED_TIME 0.000185\n",
"dtype: float64"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Raw coefficients and intercept of the fitted logistic regression.\n",
"coefs_log = class_models[\"logistic\"][\"fitted\"].coef_\n",
"intercept_log = class_models[\"logistic\"][\"fitted\"].intercept_\n",
"print(\"coefficients:\\t%s\" % coefs_log)\n",
"print(\"intercept:\\t%s\" % intercept_log)\n",
"\n",
"# Per-feature standard deviation of the training data (axis 0 = over rows).\n",
"stdv = np.std(X_train, 0)\n",
"\n",
"# Weight each coefficient by its feature's spread so the magnitudes can be\n",
"# ranked against each other. ravel() flattens coef_ without hard-coding the\n",
"# number of features, so the cell keeps working if the feature set changes.\n",
"abs(coefs_log.ravel() * stdv).sort_values(ascending=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Дерево решений"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"|--- DEP_DELAY <= 20.50\n",
"| |--- DEP_DELAY <= 15.50\n",
"| | |--- class: 0\n",
"| |--- DEP_DELAY > 15.50\n",
"| | |--- PCT_ELAPSED_TIME <= 0.99\n",
"| | | |--- PCT_ELAPSED_TIME <= 0.98\n",
"| | | | |--- PCT_ELAPSED_TIME <= 0.96\n",
"| | | | | |--- CRS_ELAPSED_TIME <= 65.50\n",
"| | | | | | |--- PCT_ELAPSED_TIME <= 0.94\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- PCT_ELAPSED_TIME > 0.94\n",
"| | | | | | | |--- class: 0\n",
"| | | | | |--- CRS_ELAPSED_TIME > 65.50\n",
"| | | | | | |--- PCT_ELAPSED_TIME <= 0.95\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- PCT_ELAPSED_TIME > 0.95\n",
"| | | | | | | |--- class: 0\n",
"| | | | |--- PCT_ELAPSED_TIME > 0.96\n",
"| | | | | |--- CRS_ELAPSED_TIME <= 140.50\n",
"| | | | | | |--- DEP_DELAY <= 18.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- DEP_DELAY > 18.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | |--- CRS_ELAPSED_TIME > 140.50\n",
"| | | | | | |--- DEP_DELAY <= 19.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- DEP_DELAY > 19.50\n",
"| | | | | | | |--- class: 0\n",
"| | | |--- PCT_ELAPSED_TIME > 0.98\n",
"| | | | |--- DEP_DELAY <= 18.50\n",
"| | | | | |--- DISTANCE <= 326.50\n",
"| | | | | | |--- LATE_AIRCRAFT_DELAY <= 0.50\n",
"| | | | | | | |--- class: 1\n",
"| | | | | | |--- LATE_AIRCRAFT_DELAY > 0.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | |--- DISTANCE > 326.50\n",
"| | | | | | |--- DEP_DELAY <= 17.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- DEP_DELAY > 17.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | |--- DEP_DELAY > 18.50\n",
"| | | | | |--- LATE_AIRCRAFT_DELAY <= 1.50\n",
"| | | | | | |--- DISTANCE <= 1358.50\n",
"| | | | | | | |--- class: 1\n",
"| | | | | | |--- DISTANCE > 1358.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | |--- LATE_AIRCRAFT_DELAY > 1.50\n",
"| | | | | | |--- class: 0\n",
"| | |--- PCT_ELAPSED_TIME > 0.99\n",
"| | | |--- LATE_AIRCRAFT_DELAY <= 1.50\n",
"| | | | |--- WEATHER_DELAY <= 2.00\n",
"| | | | | |--- NAS_DELAY <= 17.50\n",
"| | | | | | |--- PCT_ELAPSED_TIME <= 1.00\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- PCT_ELAPSED_TIME > 1.00\n",
"| | | | | | | |--- class: 1\n",
"| | | | | |--- NAS_DELAY > 17.50\n",
"| | | | | | |--- PCT_ELAPSED_TIME <= 1.09\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- PCT_ELAPSED_TIME > 1.09\n",
"| | | | | | | |--- class: 1\n",
"| | | | |--- WEATHER_DELAY > 2.00\n",
"| | | | | |--- class: 0\n",
"| | | |--- LATE_AIRCRAFT_DELAY > 1.50\n",
"| | | | |--- LATE_AIRCRAFT_DELAY <= 3.50\n",
"| | | | | |--- DEP_DELAY <= 18.50\n",
"| | | | | | |--- DISTANCE <= 153.50\n",
"| | | | | | | |--- class: 1\n",
"| | | | | | |--- DISTANCE > 153.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | |--- DEP_DELAY > 18.50\n",
"| | | | | | |--- WEATHER_DELAY <= 2.50\n",
"| | | | | | | |--- class: 1\n",
"| | | | | | |--- WEATHER_DELAY > 2.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | |--- LATE_AIRCRAFT_DELAY > 3.50\n",
"| | | | | |--- LATE_AIRCRAFT_DELAY <= 4.50\n",
"| | | | | | |--- DEP_DELAY <= 19.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- DEP_DELAY > 19.50\n",
"| | | | | | | |--- class: 1\n",
"| | | | | |--- LATE_AIRCRAFT_DELAY > 4.50\n",
"| | | | | | |--- class: 0\n",
"|--- DEP_DELAY > 20.50\n",
"| |--- LATE_AIRCRAFT_DELAY <= 11.50\n",
"| | |--- NAS_DELAY <= 27.50\n",
"| | | |--- DEP_DELAY <= 35.50\n",
"| | | | |--- PCT_ELAPSED_TIME <= 0.96\n",
"| | | | | |--- DEP_DELAY <= 28.50\n",
"| | | | | | |--- PCT_ELAPSED_TIME <= 0.93\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- PCT_ELAPSED_TIME > 0.93\n",
"| | | | | | | |--- class: 0\n",
"| | | | | |--- DEP_DELAY > 28.50\n",
"| | | | | | |--- PCT_ELAPSED_TIME <= 0.92\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- PCT_ELAPSED_TIME > 0.92\n",
"| | | | | | | |--- class: 1\n",
"| | | | |--- PCT_ELAPSED_TIME > 0.96\n",
"| | | | | |--- WEATHER_DELAY <= 4.50\n",
"| | | | | | |--- LATE_AIRCRAFT_DELAY <= 6.50\n",
"| | | | | | | |--- class: 1\n",
"| | | | | | |--- LATE_AIRCRAFT_DELAY > 6.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | |--- WEATHER_DELAY > 4.50\n",
"| | | | | | |--- WEATHER_DELAY <= 10.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- WEATHER_DELAY > 10.50\n",
"| | | | | | | |--- class: 0\n",
"| | | |--- DEP_DELAY > 35.50\n",
"| | | | |--- WEATHER_DELAY <= 16.50\n",
"| | | | | |--- DEP_DELAY <= 44.50\n",
"| | | | | | |--- PCT_ELAPSED_TIME <= 0.93\n",
"| | | | | | | |--- class: 1\n",
"| | | | | | |--- PCT_ELAPSED_TIME > 0.93\n",
"| | | | | | | |--- class: 1\n",
"| | | | | |--- DEP_DELAY > 44.50\n",
"| | | | | | |--- SECURITY_DELAY <= 20.50\n",
"| | | | | | | |--- class: 1\n",
"| | | | | | |--- SECURITY_DELAY > 20.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | |--- WEATHER_DELAY > 16.50\n",
"| | | | | |--- WEATHER_DELAY <= 23.50\n",
"| | | | | | |--- DEP_DELAY <= 57.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- DEP_DELAY > 57.50\n",
"| | | | | | | |--- class: 1\n",
"| | | | | |--- WEATHER_DELAY > 23.50\n",
"| | | | | | |--- DEP_DELAY <= 88.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- DEP_DELAY > 88.50\n",
"| | | | | | | |--- class: 0\n",
"| | |--- NAS_DELAY > 27.50\n",
"| | | |--- PCT_ELAPSED_TIME <= 1.11\n",
"| | | | |--- NAS_DELAY <= 31.50\n",
"| | | | | |--- PCT_ELAPSED_TIME <= 1.07\n",
"| | | | | | |--- DEP_DELAY <= 69.00\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- DEP_DELAY > 69.00\n",
"| | | | | | | |--- class: 1\n",
"| | | | | |--- PCT_ELAPSED_TIME > 1.07\n",
"| | | | | | |--- WEATHER_DELAY <= 10.50\n",
"| | | | | | | |--- class: 1\n",
"| | | | | | |--- WEATHER_DELAY > 10.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | |--- NAS_DELAY > 31.50\n",
"| | | | | |--- DEP_DELAY <= 471.50\n",
"| | | | | | |--- CRS_ELAPSED_TIME <= 420.00\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- CRS_ELAPSED_TIME > 420.00\n",
"| | | | | | | |--- class: 1\n",
"| | | | | |--- DEP_DELAY > 471.50\n",
"| | | | | | |--- NAS_DELAY <= 388.00\n",
"| | | | | | | |--- class: 1\n",
"| | | | | | |--- NAS_DELAY > 388.00\n",
"| | | | | | | |--- class: 0\n",
"| | | |--- PCT_ELAPSED_TIME > 1.11\n",
"| | | | |--- NAS_DELAY <= 64.50\n",
"| | | | | |--- WEATHER_DELAY <= 20.50\n",
"| | | | | | |--- DEP_DELAY <= 43.50\n",
"| | | | | | | |--- class: 1\n",
"| | | | | | |--- DEP_DELAY > 43.50\n",
"| | | | | | | |--- class: 1\n",
"| | | | | |--- WEATHER_DELAY > 20.50\n",
"| | | | | | |--- WHEELS_OFF <= 36.00\n",
"| | | | | | | |--- class: 1\n",
"| | | | | | |--- WHEELS_OFF > 36.00\n",
"| | | | | | | |--- class: 0\n",
"| | | | |--- NAS_DELAY > 64.50\n",
"| | | | | |--- PCT_ELAPSED_TIME <= 1.44\n",
"| | | | | | |--- NAS_DELAY <= 78.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- NAS_DELAY > 78.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | |--- PCT_ELAPSED_TIME > 1.44\n",
"| | | | | | |--- NAS_DELAY <= 119.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- NAS_DELAY > 119.50\n",
"| | | | | | | |--- class: 0\n",
"| |--- LATE_AIRCRAFT_DELAY > 11.50\n",
"| | |--- DEP_DELAY <= 75.50\n",
"| | | |--- DEP_DELAY <= 41.50\n",
"| | | | |--- LATE_AIRCRAFT_DELAY <= 14.50\n",
"| | | | | |--- DEP_DELAY <= 29.50\n",
"| | | | | | |--- DEP_DELAY <= 27.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- DEP_DELAY > 27.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | |--- DEP_DELAY > 29.50\n",
"| | | | | | |--- PCT_ELAPSED_TIME <= 0.97\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- PCT_ELAPSED_TIME > 0.97\n",
"| | | | | | | |--- class: 1\n",
"| | | | |--- LATE_AIRCRAFT_DELAY > 14.50\n",
"| | | | | |--- LATE_AIRCRAFT_DELAY <= 20.50\n",
"| | | | | | |--- DEP_DELAY <= 32.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- DEP_DELAY > 32.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | |--- LATE_AIRCRAFT_DELAY > 20.50\n",
"| | | | | | |--- DEP_DELAY <= 38.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- DEP_DELAY > 38.50\n",
"| | | | | | | |--- class: 0\n",
"| | | |--- DEP_DELAY > 41.50\n",
"| | | | |--- LATE_AIRCRAFT_DELAY <= 29.50\n",
"| | | | | |--- PCT_ELAPSED_TIME <= 0.94\n",
"| | | | | | |--- DEP_DELAY <= 55.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- DEP_DELAY > 55.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | |--- PCT_ELAPSED_TIME > 0.94\n",
"| | | | | | |--- WEATHER_DELAY <= 0.50\n",
"| | | | | | | |--- class: 1\n",
"| | | | | | |--- WEATHER_DELAY > 0.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | |--- LATE_AIRCRAFT_DELAY > 29.50\n",
"| | | | | |--- LATE_AIRCRAFT_DELAY <= 38.50\n",
"| | | | | | |--- DEP_DELAY <= 59.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- DEP_DELAY > 59.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | |--- LATE_AIRCRAFT_DELAY > 38.50\n",
"| | | | | | |--- DEP_DELAY <= 60.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- DEP_DELAY > 60.50\n",
"| | | | | | | |--- class: 0\n",
"| | |--- DEP_DELAY > 75.50\n",
"| | | |--- LATE_AIRCRAFT_DELAY <= 60.50\n",
"| | | | |--- WEATHER_DELAY <= 0.50\n",
"| | | | | |--- NAS_DELAY <= 38.50\n",
"| | | | | | |--- DEP_DELAY <= 88.50\n",
"| | | | | | | |--- class: 1\n",
"| | | | | | |--- DEP_DELAY > 88.50\n",
"| | | | | | | |--- class: 1\n",
"| | | | | |--- NAS_DELAY > 38.50\n",
"| | | | | | |--- TAXI_OUT <= 63.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- TAXI_OUT > 63.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | |--- WEATHER_DELAY > 0.50\n",
"| | | | | |--- WEATHER_DELAY <= 18.50\n",
"| | | | | | |--- LATE_AIRCRAFT_DELAY <= 31.50\n",
"| | | | | | | |--- class: 1\n",
"| | | | | | |--- LATE_AIRCRAFT_DELAY > 31.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | |--- WEATHER_DELAY > 18.50\n",
"| | | | | | |--- DEP_AFPH <= 99.64\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- DEP_AFPH > 99.64\n",
"| | | | | | | |--- class: 0\n",
"| | | |--- LATE_AIRCRAFT_DELAY > 60.50\n",
"| | | | |--- DEP_DELAY <= 114.50\n",
"| | | | | |--- LATE_AIRCRAFT_DELAY <= 71.50\n",
"| | | | | | |--- DEP_DELAY <= 95.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- DEP_DELAY > 95.50\n",
"| | | | | | | |--- class: 1\n",
"| | | | | |--- LATE_AIRCRAFT_DELAY > 71.50\n",
"| | | | | | |--- DEP_DELAY <= 96.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- DEP_DELAY > 96.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | |--- DEP_DELAY > 114.50\n",
"| | | | | |--- LATE_AIRCRAFT_DELAY <= 98.50\n",
"| | | | | | |--- WEATHER_DELAY <= 1.00\n",
"| | | | | | | |--- class: 1\n",
"| | | | | | |--- WEATHER_DELAY > 1.00\n",
"| | | | | | | |--- class: 0\n",
"| | | | | |--- LATE_AIRCRAFT_DELAY > 98.50\n",
"| | | | | | |--- DEP_DELAY <= 171.50\n",
"| | | | | | | |--- class: 0\n",
"| | | | | | |--- DEP_DELAY > 171.50\n",
"| | | | | | | |--- class: 0\n",
"\n"
]
}
],
"source": [
"# Text rendering of the fitted decision tree's split rules, labelled with\n",
"# the training columns' names instead of positional feature indices.\n",
"text_tree = tree.export_text(\n",
"    class_models[\"decision_tree\"][\"fitted\"], feature_names=list(X_train.columns)\n",
")\n",
"print(text_tree)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>feature</th>\n",
" <th>importance</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>DEP_DELAY</td>\n",
" <td>0.527482</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>LATE_AIRCRAFT_DELAY</td>\n",
" <td>0.199153</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>PCT_ELAPSED_TIME</td>\n",
" <td>0.105381</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>WEATHER_DELAY</td>\n",
" <td>0.101649</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>NAS_DELAY</td>\n",
" <td>0.062732</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>SECURITY_DELAY</td>\n",
" <td>0.001998</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>DISTANCE</td>\n",
" <td>0.001019</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>CRS_ELAPSED_TIME</td>\n",
" <td>0.000281</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>TAXI_OUT</td>\n",
" <td>0.000239</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>WHEELS_OFF</td>\n",
" <td>0.000035</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>DEP_AFPH</td>\n",
" <td>0.000031</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CRS_DEP_TIME</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>ORIGIN_HUB</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>DEP_DOW</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>DEP_MONTH</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>CRS_ARR_TIME</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>ARR_RFPH</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>ARR_AFPH</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>DEP_TIME</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>DEP_RFPH</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>DEST_HUB</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" feature importance\n",
"2 DEP_DELAY 0.527482\n",
"16 LATE_AIRCRAFT_DELAY 0.199153\n",
"8 PCT_ELAPSED_TIME 0.105381\n",
"13 WEATHER_DELAY 0.101649\n",
"14 NAS_DELAY 0.062732\n",
"15 SECURITY_DELAY 0.001998\n",
"9 DISTANCE 0.001019\n",
"7 CRS_ELAPSED_TIME 0.000281\n",
"5 TAXI_OUT 0.000239\n",
"6 WHEELS_OFF 0.000035\n",
"3 DEP_AFPH 0.000031\n",
"0 CRS_DEP_TIME 0.000000\n",
"19 ORIGIN_HUB 0.000000\n",
"18 DEP_DOW 0.000000\n",
"17 DEP_MONTH 0.000000\n",
"10 CRS_ARR_TIME 0.000000\n",
"12 ARR_RFPH 0.000000\n",
"11 ARR_AFPH 0.000000\n",
"1 DEP_TIME 0.000000\n",
"4 DEP_RFPH 0.000000\n",
"20 DEST_HUB 0.000000"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dt_imp_df = pd.DataFrame(\n",
" {\n",
" \"feature\": X_train.columns.values.tolist(),\n",
" \"importance\": class_models[\"decision_tree\"][\"fitted\"].feature_importances_,\n",
" }\n",
").sort_values(by=\"importance\", ascending=False)\n",
"dt_imp_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"k ближайших соседей"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CRS_DEP_TIME 655.000000\n",
"DEP_TIME 1055.000000\n",
"DEP_DELAY 240.000000\n",
"DEP_AFPH 90.800000\n",
"DEP_RFPH 0.890196\n",
"TAXI_OUT 35.000000\n",
"WHEELS_OFF 1130.000000\n",
"CRS_ELAPSED_TIME 259.000000\n",
"PCT_ELAPSED_TIME 1.084942\n",
"DISTANCE 1660.000000\n",
"CRS_ARR_TIME 914.000000\n",
"ARR_AFPH 40.434783\n",
"ARR_RFPH 1.064073\n",
"WEATHER_DELAY 0.000000\n",
"NAS_DELAY 22.000000\n",
"SECURITY_DELAY 0.000000\n",
"LATE_AIRCRAFT_DELAY 221.000000\n",
"DEP_MONTH 10.000000\n",
"DEP_DOW 4.000000\n",
"ORIGIN_HUB 1.000000\n",
"DEST_HUB 0.000000\n",
"Name: 721043, dtype: float64\n"
]
}
],
"source": [
"print(X_test.loc[721043, :])"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([[143.3160128 , 173.90740076, 192.66705727, 211.57109221,\n",
" 243.57211853, 259.61593993, 259.77507391]]),\n",
" array([[105172, 571912, 73409, 89450, 77474, 705972, 706911]]))"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"class_models[\"knn\"][\"fitted\"].kneighbors(\n",
" X_test.loc[721043, :].values.reshape(1, 21), 7\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3813 0\n",
"229062 1\n",
"283316 0\n",
"385831 0\n",
"581905 1\n",
"726784 1\n",
"179364 0\n",
"Name: CARRIER_DELAY, dtype: int64"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_train_class.iloc[[105172, 571912, 73409, 89450, 77474, 705972, 706911]]"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'euclidean'"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"class_models[\"knn\"][\"fitted\"].effective_metric_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Гауссов наивный Байес"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.93871674, 0.06128326])"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"class_models[\"naive_bayes\"][\"fitted\"].class_prior_"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[2.50123026e+05, 2.61324730e+05, 9.21572605e+02, 1.26123968e+03,\n",
" 2.08339528e-01, 9.58074414e+01, 2.62606651e+05, 6.30102550e+03,\n",
" 1.13475535e-02, 4.22470414e+05, 2.75433641e+05, 1.25314386e+03,\n",
" 3.48655340e-01, 1.11234714e+02, 1.91877186e+02, 2.80302201e+00,\n",
" 5.06561612e+02, 1.17346654e+01, 3.99122491e+00, 2.39015406e-01,\n",
" 2.34996222e-01],\n",
" [2.60629652e+05, 2.96009867e+05, 1.19307931e+04, 1.14839167e+03,\n",
" 1.99929921e+00, 1.20404927e+02, 3.08568277e+05, 6.29066219e+03,\n",
" 1.38936741e-02, 4.10198938e+05, 3.28574000e+05, 1.09023147e+03,\n",
" 3.08997044e+00, 7.79140423e+01, 1.56184090e+02, 9.12112286e-01,\n",
" 2.11279954e+03, 1.02712368e+01, 4.02943162e+00, 1.77750796e-01,\n",
" 2.50208354e-01]])"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"class_models[\"naive_bayes\"][\"fitted\"].var_"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1.30740577e+03, 1.31006271e+03, 5.14196506e+00, 5.45864877e+01,\n",
" 1.09377996e+00, 1.87120810e+01, 1.33552258e+03, 1.70734929e+02,\n",
" 9.71131781e-01, 1.01824369e+03, 1.48438931e+03, 5.39873058e+01,\n",
" 1.09644787e+00, 7.39971299e-01, 2.85434558e+00, 2.41814585e-02,\n",
" 4.14674395e+00, 6.55045281e+00, 2.95035528e+00, 6.06800513e-01,\n",
" 6.24199571e-01],\n",
" [1.41305545e+03, 1.48087887e+03, 8.45867640e+01, 6.14731036e+01,\n",
" 1.25429654e+00, 1.99378321e+01, 1.49409412e+03, 1.72229998e+02,\n",
" 9.83974416e-01, 1.04363666e+03, 1.54821862e+03, 4.26486417e+01,\n",
" 1.36373798e+00, 4.50733082e-01, 4.71991378e+00, 2.11281132e-02,\n",
" 1.40744819e+01, 6.73367907e+00, 3.04251232e+00, 7.69575517e-01,\n",
" 4.85391724e-01]])"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"class_models[\"naive_bayes\"][\"fitted\"].theta_"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": ".venv (3.11.12)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}