{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Загрузка данных в DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 891 entries, 1 to 891\n",
      "Data columns (total 11 columns):\n",
      " #   Column    Non-Null Count  Dtype  \n",
      "---  ------    --------------  -----  \n",
      " 0   Survived  891 non-null    int64  \n",
      " 1   Pclass    891 non-null    int64  \n",
      " 2   Name      891 non-null    object \n",
      " 3   Sex       891 non-null    object \n",
      " 4   Age       714 non-null    float64\n",
      " 5   SibSp     891 non-null    int64  \n",
      " 6   Parch     891 non-null    int64  \n",
      " 7   Ticket    891 non-null    object \n",
      " 8   Fare      891 non-null    float64\n",
      " 9   Cabin     204 non-null    object \n",
      " 10  Embarked  889 non-null    object \n",
      "dtypes: float64(2), int64(4), object(5)\n",
      "memory usage: 83.5+ KB\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(891, 11)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PassengerId</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Survived  Pclass  \\\n",
       "PassengerId                     \n",
       "1                   0       3   \n",
       "2                   1       1   \n",
       "3                   1       3   \n",
       "4                   1       1   \n",
       "5                   0       3   \n",
       "\n",
       "                                                          Name     Sex   Age  \\\n",
       "PassengerId                                                                    \n",
       "1                                      Braund, Mr. Owen Harris    male  22.0   \n",
       "2            Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0   \n",
       "3                                       Heikkinen, Miss. Laina  female  26.0   \n",
       "4                 Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0   \n",
       "5                                     Allen, Mr. William Henry    male  35.0   \n",
       "\n",
       "             SibSp  Parch            Ticket     Fare Cabin Embarked  \n",
       "PassengerId                                                          \n",
       "1                1      0         A/5 21171   7.2500   NaN        S  \n",
       "2                1      0          PC 17599  71.2833   C85        C  \n",
       "3                0      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "4                1      0            113803  53.1000  C123        S  \n",
       "5                0      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"data/titanic.csv\", index_col=\"PassengerId\")\n",
    "\n",
    "df.info()\n",
    "\n",
    "display(df.shape)\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Получение сведений о пропущенных данных"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Типы пропущенных данных:\n",
    "- None - представление пустых данных в Python\n",
    "- NaN - представление пустых данных в Pandas\n",
    "- '' - пустая строка"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Survived      0\n",
       "Pclass        0\n",
       "Name          0\n",
       "Sex           0\n",
       "Age         177\n",
       "SibSp         0\n",
       "Parch         0\n",
       "Ticket        0\n",
       "Fare          0\n",
       "Cabin       687\n",
       "Embarked      2\n",
       "dtype: int64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Survived    False\n",
       "Pclass      False\n",
       "Name        False\n",
       "Sex         False\n",
       "Age          True\n",
       "SibSp       False\n",
       "Parch       False\n",
       "Ticket      False\n",
       "Fare        False\n",
       "Cabin        True\n",
       "Embarked     True\n",
       "dtype: bool"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Age процент пустых значений: %19.87'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Cabin процент пустых значений: %77.10'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Embarked процент пустых значений: %0.22'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Количество пустых значений признаков\n",
    "display(df.isnull().sum())\n",
    "display()\n",
    "\n",
    "# Есть ли пустые значения признаков\n",
    "display(df.isnull().any())\n",
    "display()\n",
    "\n",
    "# Процент пустых значений признаков\n",
    "for i in df.columns:\n",
    "    null_rate = df[i].isnull().sum() / len(df) * 100\n",
    "    if null_rate > 0:\n",
    "        display(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Заполнение пропущенных данных\n",
    "\n",
    "https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
    "\n",
    "https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(891, 11)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Survived    False\n",
       "Pclass      False\n",
       "Name        False\n",
       "Sex         False\n",
       "Age         False\n",
       "SibSp       False\n",
       "Parch       False\n",
       "Ticket      False\n",
       "Fare        False\n",
       "Cabin       False\n",
       "Embarked    False\n",
       "dtype: bool"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>AgeFillNA</th>\n",
       "      <th>AgeFillMedian</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PassengerId</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>887</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>Montvila, Rev. Juozas</td>\n",
       "      <td>male</td>\n",
       "      <td>27.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>211536</td>\n",
       "      <td>13.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "      <td>27.0</td>\n",
       "      <td>27.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>888</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Graham, Miss. Margaret Edith</td>\n",
       "      <td>female</td>\n",
       "      <td>19.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>112053</td>\n",
       "      <td>30.00</td>\n",
       "      <td>B42</td>\n",
       "      <td>S</td>\n",
       "      <td>19.0</td>\n",
       "      <td>19.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>889</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>W./C. 6607</td>\n",
       "      <td>23.45</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "      <td>0.0</td>\n",
       "      <td>28.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Behr, Mr. Karl Howell</td>\n",
       "      <td>male</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>111369</td>\n",
       "      <td>30.00</td>\n",
       "      <td>C148</td>\n",
       "      <td>C</td>\n",
       "      <td>26.0</td>\n",
       "      <td>26.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>891</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Dooley, Mr. Patrick</td>\n",
       "      <td>male</td>\n",
       "      <td>32.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>370376</td>\n",
       "      <td>7.75</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Q</td>\n",
       "      <td>32.0</td>\n",
       "      <td>32.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Survived  Pclass                                      Name  \\\n",
       "PassengerId                                                               \n",
       "887                 0       2                     Montvila, Rev. Juozas   \n",
       "888                 1       1              Graham, Miss. Margaret Edith   \n",
       "889                 0       3  Johnston, Miss. Catherine Helen \"Carrie\"   \n",
       "890                 1       1                     Behr, Mr. Karl Howell   \n",
       "891                 0       3                       Dooley, Mr. Patrick   \n",
       "\n",
       "                Sex   Age  SibSp  Parch      Ticket   Fare Cabin Embarked  \\\n",
       "PassengerId                                                                 \n",
       "887            male  27.0      0      0      211536  13.00   NaN        S   \n",
       "888          female  19.0      0      0      112053  30.00   B42        S   \n",
       "889          female   NaN      1      2  W./C. 6607  23.45   NaN        S   \n",
       "890            male  26.0      0      0      111369  30.00  C148        C   \n",
       "891            male  32.0      0      0      370376   7.75   NaN        Q   \n",
       "\n",
       "             AgeFillNA  AgeFillMedian  \n",
       "PassengerId                            \n",
       "887               27.0           27.0  \n",
       "888               19.0           19.0  \n",
       "889                0.0           28.0  \n",
       "890               26.0           26.0  \n",
       "891               32.0           32.0  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fillna_df = df.fillna(0)\n",
    "\n",
    "display(fillna_df.shape)\n",
    "\n",
    "display(fillna_df.isnull().any())\n",
    "\n",
    "# Замена пустых данных на 0\n",
    "df[\"AgeFillNA\"] = df[\"Age\"].fillna(0)\n",
    "\n",
    "# Замена пустых данных на медиану\n",
    "df[\"AgeFillMedian\"] = df[\"Age\"].fillna(df[\"Age\"].median())\n",
    "\n",
    "df.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>AgeFillNA</th>\n",
       "      <th>AgeFillMedian</th>\n",
       "      <th>AgeCopy</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PassengerId</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>887</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>Montvila, Rev. Juozas</td>\n",
       "      <td>male</td>\n",
       "      <td>27.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>211536</td>\n",
       "      <td>13.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "      <td>27.0</td>\n",
       "      <td>27.0</td>\n",
       "      <td>27.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>888</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Graham, Miss. Margaret Edith</td>\n",
       "      <td>female</td>\n",
       "      <td>19.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>112053</td>\n",
       "      <td>30.00</td>\n",
       "      <td>B42</td>\n",
       "      <td>S</td>\n",
       "      <td>19.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>19.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>889</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>W./C. 6607</td>\n",
       "      <td>23.45</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "      <td>0.0</td>\n",
       "      <td>28.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Behr, Mr. Karl Howell</td>\n",
       "      <td>male</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>111369</td>\n",
       "      <td>30.00</td>\n",
       "      <td>C148</td>\n",
       "      <td>C</td>\n",
       "      <td>26.0</td>\n",
       "      <td>26.0</td>\n",
       "      <td>26.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>891</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Dooley, Mr. Patrick</td>\n",
       "      <td>male</td>\n",
       "      <td>32.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>370376</td>\n",
       "      <td>7.75</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Q</td>\n",
       "      <td>32.0</td>\n",
       "      <td>32.0</td>\n",
       "      <td>32.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Survived  Pclass                                      Name  \\\n",
       "PassengerId                                                               \n",
       "887                 0       2                     Montvila, Rev. Juozas   \n",
       "888                 1       1              Graham, Miss. Margaret Edith   \n",
       "889                 0       3  Johnston, Miss. Catherine Helen \"Carrie\"   \n",
       "890                 1       1                     Behr, Mr. Karl Howell   \n",
       "891                 0       3                       Dooley, Mr. Patrick   \n",
       "\n",
       "                Sex   Age  SibSp  Parch      Ticket   Fare Cabin Embarked  \\\n",
       "PassengerId                                                                 \n",
       "887            male  27.0      0      0      211536  13.00   NaN        S   \n",
       "888          female  19.0      0      0      112053  30.00   B42        S   \n",
       "889          female   NaN      1      2  W./C. 6607  23.45   NaN        S   \n",
       "890            male  26.0      0      0      111369  30.00  C148        C   \n",
       "891            male  32.0      0      0      370376   7.75   NaN        Q   \n",
       "\n",
       "             AgeFillNA  AgeFillMedian  AgeCopy  \n",
       "PassengerId                                     \n",
       "887               27.0           27.0     27.0  \n",
       "888               19.0           19.0     19.0  \n",
       "889                0.0           28.0      0.0  \n",
       "890               26.0           26.0     26.0  \n",
       "891               32.0           32.0     32.0  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"AgeCopy\"] = df[\"Age\"]\n",
    "\n",
    "# Замена данных сразу в DataFrame без копирования\n",
    "df.fillna({\"AgeCopy\": 0}, inplace=True)\n",
    "\n",
    "df.tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Удаление наблюдений с пропусками"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(183, 14)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Survived    False\n",
       "Pclass      False\n",
       "Name        False\n",
       "Sex         False\n",
       "Age         False\n",
       "SibSp       False\n",
       "Parch       False\n",
       "Ticket      False\n",
       "Fare        False\n",
       "Cabin       False\n",
       "Embarked    False\n",
       "dtype: bool"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "dropna_df = df.dropna()\n",
    "\n",
    "display(dropna_df.shape)\n",
    "\n",
    "display(fillna_df.isnull().any())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Создание выборок данных\n",
    "\n",
    "Библиотека scikit-learn\n",
    "\n",
    "https://scikit-learn.org/stable/index.html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"assets/lec2-split.png\" width=\"600\" style=\"background-color: white\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pclass\n",
       "3    491\n",
       "1    216\n",
       "2    184\n",
       "Name: count, dtype: int64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Обучающая выборка: '"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(534, 3)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Pclass\n",
       "3    294\n",
       "1    130\n",
       "2    110\n",
       "Name: count, dtype: int64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Контрольная выборка: '"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(178, 3)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Pclass\n",
       "3    98\n",
       "1    43\n",
       "2    37\n",
       "Name: count, dtype: int64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Тестовая выборка: '"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(179, 3)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Pclass\n",
       "3    99\n",
       "1    43\n",
       "2    37\n",
       "Name: count, dtype: int64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Вывод распределения количества наблюдений по меткам (классам)\n",
    "from src.utils import split_stratified_into_train_val_test\n",
    "\n",
    "\n",
    "display(df.Pclass.value_counts())\n",
    "display()\n",
    "\n",
    "data = df[[\"Pclass\", \"Survived\", \"AgeFillMedian\"]].copy()\n",
    "\n",
    "df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
    "   data, stratify_colname=\"Pclass\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
    ")\n",
    "\n",
    "display(\"Обучающая выборка: \", df_train.shape)\n",
    "display(df_train.Pclass.value_counts())\n",
    "\n",
    "display(\"Контрольная выборка: \", df_val.shape)\n",
    "display(df_val.Pclass.value_counts())\n",
    "\n",
    "display(\"Тестовая выборка: \", df_test.shape)\n",
    "display(df_test.Pclass.value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Выборка с избытком (oversampling)\n",
    "\n",
    "https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
    "\n",
    "https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
    "\n",
    "Выборка с недостатком (undersampling)\n",
    "\n",
    "https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
    "\n",
    "Библиотека imbalanced-learn\n",
    "\n",
    "https://imbalanced-learn.org/stable/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Обучающая выборка: '"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(534, 3)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Pclass\n",
       "3    294\n",
       "1    130\n",
       "2    110\n",
       "Name: count, dtype: int64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Обучающая выборка после oversampling: '"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(860, 3)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Pclass\n",
       "3    294\n",
       "2    288\n",
       "1    278\n",
       "Name: count, dtype: int64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Survived</th>\n",
       "      <th>AgeFillMedian</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>28.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>30.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>41.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>16.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>32.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>855</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>25.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>856</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>49.895392</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>857</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>49.548159</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>858</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>49.410133</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>859</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>50.944726</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>860 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Pclass  Survived  AgeFillMedian\n",
       "0         3         0      28.000000\n",
       "1         3         0      30.000000\n",
       "2         1         1      41.000000\n",
       "3         3         0      16.000000\n",
       "4         3         1      32.000000\n",
       "..      ...       ...            ...\n",
       "855       2         0      25.000000\n",
       "856       2         1      49.895392\n",
       "857       2         0      49.548159\n",
       "858       2         0      49.410133\n",
       "859       2         0      50.944726\n",
       "\n",
       "[860 rows x 3 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from imblearn.over_sampling import ADASYN\n",
    "\n",
    "ada = ADASYN()\n",
    "\n",
    "display(\"Обучающая выборка: \", df_train.shape)\n",
    "display(df_train.Pclass.value_counts())\n",
    "\n",
    "X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Pclass\"]) # type: ignore\n",
    "df_train_adasyn = pd.DataFrame(X_resampled)\n",
    "\n",
    "display(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
    "display(df_train_adasyn.Pclass.value_counts())\n",
    "\n",
    "df_train_adasyn"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}