{ "cells": [ { "cell_type": "markdown", "id": "0fd64751", "metadata": {}, "source": [ "#### Инициализация Keras" ] }, { "cell_type": "code", "execution_count": 12, "id": "507915ea", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3.9.2\n" ] } ], "source": [ "import os\n", "\n", "os.environ[\"KERAS_BACKEND\"] = \"torch\"\n", "import keras\n", "\n", "print(keras.__version__)" ] }, { "cell_type": "markdown", "id": "c5e00991", "metadata": {}, "source": [ "#### Загрузка данных для классификации с помощью глубоких сетей\n", "\n", "В качестве набора данных используется набор отзывов к фильмам с сайта IMDB.\n", "\n", "Набор включает 50 000 отзывов, половина из которых находится в обучающем наборе данных (X_train), а половина - в тестовом (X_valid). \n", "\n", "Данные уже предобработаны для простоты работы с ними.\n", "\n", "unique_words - размер словаря: в векторное пространство включаются только 5000 наиболее часто встречающихся в корпусе слов, остальные слова заменяются на UNK.\n", "\n", "max_length - максимальная длина отзыва (если отзыв длиннее, то он обрезается, если короче, то дополняется \"пустыми\" словами)." 
] }, { "cell_type": "code", "execution_count": 13, "id": "e0043e5c", "metadata": {}, "outputs": [], "source": [ "# Documented public import path (instead of the keras.api package).\n", "from keras.datasets import imdb\n", "import os\n", "\n", "unique_words = 5000\n", "max_length = 100\n", "\n", "output_dir = \"tmp\"\n", "# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.\n", "os.makedirs(output_dir, exist_ok=True)\n", "\n", "# num_words keeps only the unique_words most frequent words;\n", "# skip_top additionally filters out the 50 most frequent ones.\n", "(X_train, y_train), (X_valid, y_valid) = imdb.load_data(num_words=unique_words, skip_top=50)" ] }, { "cell_type": "markdown", "id": "022cf1f8", "metadata": {}, "source": [ "#### Исследование набора данных\n", "\n", "Все слова закодированы числовыми идентификаторами для снижения расхода памяти\n", "\n", "Идентификаторы 0, 1 и 2 зарезервированы:\n", "- 0 (PAD) - заполняющее (\"пустое\") слово для дополнения отзывов до длины 100;\n", "- 1 (START) - определяет начало отзыва;\n", "- 2 (UNK) - отфильтрованные при загрузке отзывов слова (редкие слова или стоп-слова).\n", "\n", "Далее идентификаторы определяют слова в порядке снижения частоты их встречаемости в корпусе." ] }, { "cell_type": "code", "execution_count": 14, "id": "aadc3471", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{34704: 'fawn',\n", " 52009: 'tsukino',\n", " 52010: 'nunnery',\n", " 16819: 'sonja',\n", " 63954: 'vani',\n", " 1411: 'woods',\n", " 16118: 'spiders',\n", " 2348: 'hanging',\n", " 2292: 'woody',\n", " 52011: 'trawling',\n", " 52012: \"hold's\",\n", " 11310: 'comically',\n", " 40833: 'localized',\n", " 30571: 'disobeying',\n", " 52013: \"'royale\",\n", " 40834: \"harpo's\",\n", " 52014: 'canet',\n", " 19316: 'aileen',\n", " 52015: 'acurately',\n", " 52016: \"diplomat's\",\n", " 25245: 'rickman',\n", " 6749: 'arranged',\n", " 52017: 'rumbustious',\n", " 52018: 'familiarness',\n", " 52019: \"spider'\",\n", " 68807: 'hahahah',\n", " 52020: \"wood'\",\n", " 40836: 'transvestism',\n", " 34705: \"hangin'\",\n", " 2341: 'bringing',\n", " 40837: 'seamier',\n", " 34706: 'wooded',\n", " 52021: 'bravora',\n", " 16820: 'grueling',\n", " 1639: 'wooden',\n", " 16821: 
'wednesday',\n", " 52022: \"'prix\",\n", " 34707: 'altagracia',\n", " 52023: 'circuitry',\n", " 11588: 'crotch',\n", " 57769: 'busybody',\n", " 52024: \"tart'n'tangy\",\n", " 14132: 'burgade',\n", " 52026: 'thrace',\n", " 11041: \"tom's\",\n", " 52028: 'snuggles',\n", " 29117: 'francesco',\n", " 52030: 'complainers',\n", " 52128: 'templarios',\n", " 40838: '272',\n", " 52031: '273',\n", " 52133: 'zaniacs',\n", " 34709: '275',\n", " 27634: 'consenting',\n", " 40839: 'snuggled',\n", " 15495: 'inanimate',\n", " 52033: 'uality',\n", " 11929: 'bronte',\n", " 4013: 'errors',\n", " 3233: 'dialogs',\n", " 52034: \"yomada's\",\n", " 34710: \"madman's\",\n", " 30588: 'dialoge',\n", " 52036: 'usenet',\n", " 40840: 'videodrome',\n", " 26341: \"kid'\",\n", " 52037: 'pawed',\n", " 30572: \"'girlfriend'\",\n", " 52038: \"'pleasure\",\n", " 52039: \"'reloaded'\",\n", " 40842: \"kazakos'\",\n", " 52040: 'rocque',\n", " 52041: 'mailings',\n", " 11930: 'brainwashed',\n", " 16822: 'mcanally',\n", " 52042: \"tom''\",\n", " 25246: 'kurupt',\n", " 21908: 'affiliated',\n", " 52043: 'babaganoosh',\n", " 40843: \"noe's\",\n", " 40844: 'quart',\n", " 362: 'kids',\n", " 5037: 'uplifting',\n", " 7096: 'controversy',\n", " 21909: 'kida',\n", " 23382: 'kidd',\n", " 52044: \"error'\",\n", " 52045: 'neurologist',\n", " 18513: 'spotty',\n", " 30573: 'cobblers',\n", " 9881: 'projection',\n", " 40845: 'fastforwarding',\n", " 52046: 'sters',\n", " 52047: \"eggar's\",\n", " 52048: 'etherything',\n", " 40846: 'gateshead',\n", " 34711: 'airball',\n", " 25247: 'unsinkable',\n", " 7183: 'stern',\n", " 52049: \"cervi's\",\n", " 40847: 'dnd',\n", " 11589: 'dna',\n", " 20601: 'insecurity',\n", " 52050: \"'reboot'\",\n", " 11040: 'trelkovsky',\n", " 52051: 'jaekel',\n", " 52052: 'sidebars',\n", " 52053: \"sforza's\",\n", " 17636: 'distortions',\n", " 52054: 'mutinies',\n", " 30605: 'sermons',\n", " 40849: '7ft',\n", " 52055: 'boobage',\n", " 52056: \"o'bannon's\",\n", " 23383: 'populations',\n", " 52057: 
'chulak',\n", " 27636: 'mesmerize',\n", " 52058: 'quinnell',\n", " 10310: 'yahoo',\n", " 52060: 'meteorologist',\n", " 42580: 'beswick',\n", " 15496: 'boorman',\n", " 40850: 'voicework',\n", " 52061: \"ster'\",\n", " 22925: 'blustering',\n", " 52062: 'hj',\n", " 27637: 'intake',\n", " 5624: 'morally',\n", " 40852: 'jumbling',\n", " 52063: 'bowersock',\n", " 52064: \"'porky's'\",\n", " 16824: 'gershon',\n", " 40853: 'ludicrosity',\n", " 52065: 'coprophilia',\n", " 40854: 'expressively',\n", " 19503: \"india's\",\n", " 34713: \"post's\",\n", " 52066: 'wana',\n", " 5286: 'wang',\n", " 30574: 'wand',\n", " 25248: 'wane',\n", " 52324: 'edgeways',\n", " 34714: 'titanium',\n", " 40855: 'pinta',\n", " 181: 'want',\n", " 30575: 'pinto',\n", " 52068: 'whoopdedoodles',\n", " 21911: 'tchaikovsky',\n", " 2106: 'travel',\n", " 52069: \"'victory'\",\n", " 11931: 'copious',\n", " 22436: 'gouge',\n", " 52070: \"chapters'\",\n", " 6705: 'barbra',\n", " 30576: 'uselessness',\n", " 52071: \"wan'\",\n", " 27638: 'assimilated',\n", " 16119: 'petiot',\n", " 52072: 'most\\x85and',\n", " 3933: 'dinosaurs',\n", " 355: 'wrong',\n", " 52073: 'seda',\n", " 52074: 'stollen',\n", " 34715: 'sentencing',\n", " 40856: 'ouroboros',\n", " 40857: 'assimilates',\n", " 40858: 'colorfully',\n", " 27639: 'glenne',\n", " 52075: 'dongen',\n", " 4763: 'subplots',\n", " 52076: 'kiloton',\n", " 23384: 'chandon',\n", " 34716: \"effect'\",\n", " 27640: 'snugly',\n", " 40859: 'kuei',\n", " 9095: 'welcomed',\n", " 30074: 'dishonor',\n", " 52078: 'concurrence',\n", " 23385: 'stoicism',\n", " 14899: \"guys'\",\n", " 52080: \"beroemd'\",\n", " 6706: 'butcher',\n", " 40860: \"melfi's\",\n", " 30626: 'aargh',\n", " 20602: 'playhouse',\n", " 11311: 'wickedly',\n", " 1183: 'fit',\n", " 52081: 'labratory',\n", " 40862: 'lifeline',\n", " 1930: 'screaming',\n", " 4290: 'fix',\n", " 52082: 'cineliterate',\n", " 52083: 'fic',\n", " 52084: 'fia',\n", " 34717: 'fig',\n", " 52085: 'fmvs',\n", " 52086: 'fie',\n", " 52087: 
'reentered',\n", " 30577: 'fin',\n", " 52088: 'doctresses',\n", " 52089: 'fil',\n", " 12609: 'zucker',\n", " 31934: 'ached',\n", " 52091: 'counsil',\n", " 52092: 'paterfamilias',\n", " 13888: 'songwriter',\n", " 34718: 'shivam',\n", " 9657: 'hurting',\n", " 302: 'effects',\n", " 52093: 'slauther',\n", " 52094: \"'flame'\",\n", " 52095: 'sommerset',\n", " 52096: 'interwhined',\n", " 27641: 'whacking',\n", " 52097: 'bartok',\n", " 8778: 'barton',\n", " 21912: 'frewer',\n", " 52098: \"fi'\",\n", " 6195: 'ingrid',\n", " 30578: 'stribor',\n", " 52099: 'approporiately',\n", " 52100: 'wobblyhand',\n", " 52101: 'tantalisingly',\n", " 52102: 'ankylosaurus',\n", " 17637: 'parasites',\n", " 52103: 'childen',\n", " 52104: \"jenkins'\",\n", " 52105: 'metafiction',\n", " 17638: 'golem',\n", " 40863: 'indiscretion',\n", " 23386: \"reeves'\",\n", " 57784: \"inamorata's\",\n", " 52107: 'brittannica',\n", " 7919: 'adapt',\n", " 30579: \"russo's\",\n", " 48249: 'guitarists',\n", " 10556: 'abbott',\n", " 40864: 'abbots',\n", " 17652: 'lanisha',\n", " 40866: 'magickal',\n", " 52108: 'mattter',\n", " 52109: \"'willy\",\n", " 34719: 'pumpkins',\n", " 52110: 'stuntpeople',\n", " 30580: 'estimate',\n", " 40867: 'ugghhh',\n", " 11312: 'gameplay',\n", " 52111: \"wern't\",\n", " 40868: \"n'sync\",\n", " 16120: 'sickeningly',\n", " 40869: 'chiara',\n", " 4014: 'disturbed',\n", " 40870: 'portmanteau',\n", " 52112: 'ineffectively',\n", " 82146: \"duchonvey's\",\n", " 37522: \"nasty'\",\n", " 1288: 'purpose',\n", " 52115: 'lazers',\n", " 28108: 'lightened',\n", " 52116: 'kaliganj',\n", " 52117: 'popularism',\n", " 18514: \"damme's\",\n", " 30581: 'stylistics',\n", " 52118: 'mindgaming',\n", " 46452: 'spoilerish',\n", " 52120: \"'corny'\",\n", " 34721: 'boerner',\n", " 6795: 'olds',\n", " 52121: 'bakelite',\n", " 27642: 'renovated',\n", " 27643: 'forrester',\n", " 52122: \"lumiere's\",\n", " 52027: 'gaskets',\n", " 887: 'needed',\n", " 34722: 'smight',\n", " 1300: 'master',\n", " 25908: 
\"edie's\",\n", " 40871: 'seeber',\n", " 52123: 'hiya',\n", " 52124: 'fuzziness',\n", " 14900: 'genesis',\n", " 12610: 'rewards',\n", " 30582: 'enthrall',\n", " 40872: \"'about\",\n", " 52125: \"recollection's\",\n", " 11042: 'mutilated',\n", " 52126: 'fatherlands',\n", " 52127: \"fischer's\",\n", " 5402: 'positively',\n", " 34708: '270',\n", " 34723: 'ahmed',\n", " 9839: 'zatoichi',\n", " 13889: 'bannister',\n", " 52130: 'anniversaries',\n", " 30583: \"helm's\",\n", " 52131: \"'work'\",\n", " 34724: 'exclaimed',\n", " 52132: \"'unfunny'\",\n", " 52032: '274',\n", " 547: 'feeling',\n", " 52134: \"wanda's\",\n", " 33269: 'dolan',\n", " 52136: '278',\n", " 52137: 'peacoat',\n", " 40873: 'brawny',\n", " 40874: 'mishra',\n", " 40875: 'worlders',\n", " 52138: 'protags',\n", " 52139: 'skullcap',\n", " 57599: 'dastagir',\n", " 5625: 'affairs',\n", " 7802: 'wholesome',\n", " 52140: 'hymen',\n", " 25249: 'paramedics',\n", " 52141: 'unpersons',\n", " 52142: 'heavyarms',\n", " 52143: 'affaire',\n", " 52144: 'coulisses',\n", " 40876: 'hymer',\n", " 52145: 'kremlin',\n", " 30584: 'shipments',\n", " 52146: 'pixilated',\n", " 30585: \"'00s\",\n", " 18515: 'diminishing',\n", " 1360: 'cinematic',\n", " 14901: 'resonates',\n", " 40877: 'simplify',\n", " 40878: \"nature'\",\n", " 40879: 'temptresses',\n", " 16825: 'reverence',\n", " 19505: 'resonated',\n", " 34725: 'dailey',\n", " 52147: '2\\x85',\n", " 27644: 'treize',\n", " 52148: 'majo',\n", " 21913: 'kiya',\n", " 52149: 'woolnough',\n", " 39800: 'thanatos',\n", " 35734: 'sandoval',\n", " 40882: 'dorama',\n", " 52150: \"o'shaughnessy\",\n", " 4991: 'tech',\n", " 32021: 'fugitives',\n", " 30586: 'teck',\n", " 76128: \"'e'\",\n", " 40884: 'doesn’t',\n", " 52152: 'purged',\n", " 660: 'saying',\n", " 41098: \"martians'\",\n", " 23421: 'norliss',\n", " 27645: 'dickey',\n", " 52155: 'dicker',\n", " 52156: \"'sependipity\",\n", " 8425: 'padded',\n", " 57795: 'ordell',\n", " 40885: \"sturges'\",\n", " 52157: 'independentcritics',\n", " 
5748: 'tempted',\n", " 34727: \"atkinson's\",\n", " 25250: 'hounded',\n", " 52158: 'apace',\n", " 15497: 'clicked',\n", " 30587: \"'humor'\",\n", " 17180: \"martino's\",\n", " 52159: \"'supporting\",\n", " 52035: 'warmongering',\n", " 34728: \"zemeckis's\",\n", " 21914: 'lube',\n", " 52160: 'shocky',\n", " 7479: 'plate',\n", " 40886: 'plata',\n", " 40887: 'sturgess',\n", " 40888: \"nerds'\",\n", " 20603: 'plato',\n", " 34729: 'plath',\n", " 40889: 'platt',\n", " 52162: 'mcnab',\n", " 27646: 'clumsiness',\n", " 3902: 'altogether',\n", " 42587: 'massacring',\n", " 52163: 'bicenntinial',\n", " 40890: 'skaal',\n", " 14363: 'droning',\n", " 8779: 'lds',\n", " 21915: 'jaguar',\n", " 34730: \"cale's\",\n", " 1780: 'nicely',\n", " 4591: 'mummy',\n", " 18516: \"lot's\",\n", " 10089: 'patch',\n", " 50205: 'kerkhof',\n", " 52164: \"leader's\",\n", " 27647: \"'movie\",\n", " 52165: 'uncomfirmed',\n", " 40891: 'heirloom',\n", " 47363: 'wrangle',\n", " 52166: 'emotion\\x85',\n", " 52167: \"'stargate'\",\n", " 40892: 'pinoy',\n", " 40893: 'conchatta',\n", " 41131: 'broeke',\n", " 40894: 'advisedly',\n", " 17639: \"barker's\",\n", " 52169: 'descours',\n", " 775: 'lots',\n", " 9262: 'lotr',\n", " 9882: 'irs',\n", " 52170: 'lott',\n", " 40895: 'xvi',\n", " 34731: 'irk',\n", " 52171: 'irl',\n", " 6890: 'ira',\n", " 21916: 'belzer',\n", " 52172: 'irc',\n", " 27648: 'ire',\n", " 40896: 'requisites',\n", " 7696: 'discipline',\n", " 52964: 'lyoko',\n", " 11313: 'extend',\n", " 876: 'nature',\n", " 52173: \"'dickie'\",\n", " 40897: 'optimist',\n", " 30589: 'lapping',\n", " 3903: 'superficial',\n", " 52174: 'vestment',\n", " 2826: 'extent',\n", " 52175: 'tendons',\n", " 52176: \"heller's\",\n", " 52177: 'quagmires',\n", " 52178: 'miyako',\n", " 20604: 'moocow',\n", " 52179: \"coles'\",\n", " 40898: 'lookit',\n", " 52180: 'ravenously',\n", " 40899: 'levitating',\n", " 52181: 'perfunctorily',\n", " 30590: 'lookin',\n", " 40901: \"lot'\",\n", " 52182: 'lookie',\n", " 34873: 'fearlessly',\n", 
" 52184: 'libyan',\n", " 40902: 'fondles',\n", " 35717: 'gopher',\n", " 40904: 'wearying',\n", " 52185: \"nz's\",\n", " 27649: 'minuses',\n", " 52186: 'puposelessly',\n", " 52187: 'shandling',\n", " 31271: 'decapitates',\n", " 11932: 'humming',\n", " 40905: \"'nother\",\n", " 21917: 'smackdown',\n", " 30591: 'underdone',\n", " 40906: 'frf',\n", " 52188: 'triviality',\n", " 25251: 'fro',\n", " 8780: 'bothers',\n", " 52189: \"'kensington\",\n", " 76: 'much',\n", " 34733: 'muco',\n", " 22618: 'wiseguy',\n", " 27651: \"richie's\",\n", " 40907: 'tonino',\n", " 52190: 'unleavened',\n", " 11590: 'fry',\n", " 40908: \"'tv'\",\n", " 40909: 'toning',\n", " 14364: 'obese',\n", " 30592: 'sensationalized',\n", " 40910: 'spiv',\n", " 6262: 'spit',\n", " 7367: 'arkin',\n", " 21918: 'charleton',\n", " 16826: 'jeon',\n", " 21919: 'boardroom',\n", " 4992: 'doubts',\n", " 3087: 'spin',\n", " 53086: 'hepo',\n", " 27652: 'wildcat',\n", " 10587: 'venoms',\n", " 52194: 'misconstrues',\n", " 18517: 'mesmerising',\n", " 40911: 'misconstrued',\n", " 52195: 'rescinds',\n", " 52196: 'prostrate',\n", " 40912: 'majid',\n", " 16482: 'climbed',\n", " 34734: 'canoeing',\n", " 52198: 'majin',\n", " 57807: 'animie',\n", " 40913: 'sylke',\n", " 14902: 'conditioned',\n", " 40914: 'waddell',\n", " 52199: '3\\x85',\n", " 41191: 'hyperdrive',\n", " 34735: 'conditioner',\n", " 53156: 'bricklayer',\n", " 2579: 'hong',\n", " 52201: 'memoriam',\n", " 30595: 'inventively',\n", " 25252: \"levant's\",\n", " 20641: 'portobello',\n", " 52203: 'remand',\n", " 19507: 'mummified',\n", " 27653: 'honk',\n", " 19508: 'spews',\n", " 40915: 'visitations',\n", " 52204: 'mummifies',\n", " 25253: 'cavanaugh',\n", " 23388: 'zeon',\n", " 40916: \"jungle's\",\n", " 34736: 'viertel',\n", " 27654: 'frenchmen',\n", " 52205: 'torpedoes',\n", " 52206: 'schlessinger',\n", " 34737: 'torpedoed',\n", " 69879: 'blister',\n", " 52207: 'cinefest',\n", " 34738: 'furlough',\n", " 52208: 'mainsequence',\n", " 40917: 'mentors',\n", " 9097: 
'academic',\n", " 20605: 'stillness',\n", " 40918: 'academia',\n", " 52209: 'lonelier',\n", " 52210: 'nibby',\n", " 52211: \"losers'\",\n", " 40919: 'cineastes',\n", " 4452: 'corporate',\n", " 40920: 'massaging',\n", " 30596: 'bellow',\n", " 19509: 'absurdities',\n", " 53244: 'expetations',\n", " 40921: 'nyfiken',\n", " 75641: 'mehras',\n", " 52212: 'lasse',\n", " 52213: 'visability',\n", " 33949: 'militarily',\n", " 52214: \"elder'\",\n", " 19026: 'gainsbourg',\n", " 20606: 'hah',\n", " 13423: 'hai',\n", " 34739: 'haj',\n", " 25254: 'hak',\n", " 4314: 'hal',\n", " 4895: 'ham',\n", " 53262: 'duffer',\n", " 52216: 'haa',\n", " 69: 'had',\n", " 11933: 'advancement',\n", " 16828: 'hag',\n", " 25255: \"hand'\",\n", " 13424: 'hay',\n", " 20607: 'mcnamara',\n", " 52217: \"mozart's\",\n", " 30734: 'duffel',\n", " 30597: 'haq',\n", " 13890: 'har',\n", " 47: 'has',\n", " 2404: 'hat',\n", " 40922: 'hav',\n", " 30598: 'haw',\n", " 52218: 'figtings',\n", " 15498: 'elders',\n", " 52219: 'underpanted',\n", " 52220: 'pninson',\n", " 27655: 'unequivocally',\n", " 23676: \"barbara's\",\n", " 52222: \"bello'\",\n", " 13000: 'indicative',\n", " 40923: 'yawnfest',\n", " 52223: 'hexploitation',\n", " 52224: \"loder's\",\n", " 27656: 'sleuthing',\n", " 32625: \"justin's\",\n", " 52225: \"'ball\",\n", " 52226: \"'summer\",\n", " 34938: \"'demons'\",\n", " 52228: \"mormon's\",\n", " 34740: \"laughton's\",\n", " 52229: 'debell',\n", " 39727: 'shipyard',\n", " 30600: 'unabashedly',\n", " 40404: 'disks',\n", " 2293: 'crowd',\n", " 10090: 'crowe',\n", " 56437: \"vancouver's\",\n", " 34741: 'mosques',\n", " 6630: 'crown',\n", " 52230: 'culpas',\n", " 27657: 'crows',\n", " 53347: 'surrell',\n", " 52232: 'flowless',\n", " 52233: 'sheirk',\n", " 40926: \"'three\",\n", " 52234: \"peterson'\",\n", " 52235: 'ooverall',\n", " 40927: 'perchance',\n", " 1324: 'bottom',\n", " 53366: 'chabert',\n", " 52236: 'sneha',\n", " 13891: 'inhuman',\n", " 52237: 'ichii',\n", " 52238: 'ursla',\n", " 30601: 
'completly',\n", " 40928: 'moviedom',\n", " 52239: 'raddick',\n", " 51998: 'brundage',\n", " 40929: 'brigades',\n", " 1184: 'starring',\n", " 52240: \"'goal'\",\n", " 52241: 'caskets',\n", " 52242: 'willcock',\n", " 52243: \"threesome's\",\n", " 52244: \"mosque'\",\n", " 52245: \"cover's\",\n", " 17640: 'spaceships',\n", " 40930: 'anomalous',\n", " 27658: 'ptsd',\n", " 52246: 'shirdan',\n", " 21965: 'obscenity',\n", " 30602: 'lemmings',\n", " 30603: 'duccio',\n", " 52247: \"levene's\",\n", " 52248: \"'gorby'\",\n", " 25258: \"teenager's\",\n", " 5343: 'marshall',\n", " 9098: 'honeymoon',\n", " 3234: 'shoots',\n", " 12261: 'despised',\n", " 52249: 'okabasho',\n", " 8292: 'fabric',\n", " 18518: 'cannavale',\n", " 3540: 'raped',\n", " 52250: \"tutt's\",\n", " 17641: 'grasping',\n", " 18519: 'despises',\n", " 40931: \"thief's\",\n", " 8929: 'rapes',\n", " 52251: 'raper',\n", " 27659: \"eyre'\",\n", " 52252: 'walchek',\n", " 23389: \"elmo's\",\n", " 40932: 'perfumes',\n", " 21921: 'spurting',\n", " 52253: \"exposition'\\x85\",\n", " 52254: 'denoting',\n", " 34743: 'thesaurus',\n", " 40933: \"shoot'\",\n", " 49762: 'bonejack',\n", " 52256: 'simpsonian',\n", " 30604: 'hebetude',\n", " 34744: \"hallow's\",\n", " 52257: 'desperation\\x85',\n", " 34745: 'incinerator',\n", " 10311: 'congratulations',\n", " 52258: 'humbled',\n", " 5927: \"else's\",\n", " 40848: 'trelkovski',\n", " 52259: \"rape'\",\n", " 59389: \"'chapters'\",\n", " 52260: '1600s',\n", " 7256: 'martian',\n", " 25259: 'nicest',\n", " 52262: 'eyred',\n", " 9460: 'passenger',\n", " 6044: 'disgrace',\n", " 52263: 'moderne',\n", " 5123: 'barrymore',\n", " 52264: 'yankovich',\n", " 40934: 'moderns',\n", " 52265: 'studliest',\n", " 52266: 'bedsheet',\n", " 14903: 'decapitation',\n", " 52267: 'slurring',\n", " 52268: \"'nunsploitation'\",\n", " 34746: \"'character'\",\n", " 9883: 'cambodia',\n", " 52269: 'rebelious',\n", " 27660: 'pasadena',\n", " 40935: 'crowne',\n", " 52270: \"'bedchamber\",\n", " 52271: 
'conjectural',\n", " 52272: 'appologize',\n", " 52273: 'halfassing',\n", " 57819: 'paycheque',\n", " 20609: 'palms',\n", " 52274: \"'islands\",\n", " 40936: 'hawked',\n", " 21922: 'palme',\n", " 40937: 'conservatively',\n", " 64010: 'larp',\n", " 5561: 'palma',\n", " 21923: 'smelling',\n", " 13001: 'aragorn',\n", " 52275: 'hawker',\n", " 52276: 'hawkes',\n", " 3978: 'explosions',\n", " 8062: 'loren',\n", " 52277: \"pyle's\",\n", " 6707: 'shootout',\n", " 18520: \"mike's\",\n", " 52278: \"driscoll's\",\n", " 40938: 'cogsworth',\n", " 52279: \"britian's\",\n", " 34747: 'childs',\n", " 52280: \"portrait's\",\n", " 3629: 'chain',\n", " 2500: 'whoever',\n", " 52281: 'puttered',\n", " 52282: 'childe',\n", " 52283: 'maywether',\n", " 3039: 'chair',\n", " 52284: \"rance's\",\n", " 34748: 'machu',\n", " 4520: 'ballet',\n", " 34749: 'grapples',\n", " 76155: 'summerize',\n", " 30606: 'freelance',\n", " 52286: \"andrea's\",\n", " 52287: '\\x91very',\n", " 45882: 'coolidge',\n", " 18521: 'mache',\n", " 52288: 'balled',\n", " 40940: 'grappled',\n", " 18522: 'macha',\n", " 21924: 'underlining',\n", " 5626: 'macho',\n", " 19510: 'oversight',\n", " 25260: 'machi',\n", " 11314: 'verbally',\n", " 21925: 'tenacious',\n", " 40941: 'windshields',\n", " 18560: 'paychecks',\n", " 3399: 'jerk',\n", " 11934: \"good'\",\n", " 34751: 'prancer',\n", " 21926: 'prances',\n", " 52289: 'olympus',\n", " 21927: 'lark',\n", " 10788: 'embark',\n", " 7368: 'gloomy',\n", " 52290: 'jehaan',\n", " 52291: 'turaqui',\n", " 20610: \"child'\",\n", " 2897: 'locked',\n", " 52292: 'pranced',\n", " 2591: 'exact',\n", " 52293: 'unattuned',\n", " 786: 'minute',\n", " 16121: 'skewed',\n", " 40943: 'hodgins',\n", " 34752: 'skewer',\n", " 52294: 'think\\x85',\n", " 38768: 'rosenstein',\n", " 52295: 'helmit',\n", " 34753: 'wrestlemanias',\n", " 16829: 'hindered',\n", " 30607: \"martha's\",\n", " 52296: 'cheree',\n", " 52297: \"pluckin'\",\n", " 40944: 'ogles',\n", " 11935: 'heavyweight',\n", " 82193: 'aada',\n", " 
11315: 'chopping',\n", " 61537: 'strongboy',\n", " 41345: 'hegemonic',\n", " 40945: 'adorns',\n", " 41349: 'xxth',\n", " 34754: 'nobuhiro',\n", " 52301: 'capitães',\n", " 52302: 'kavogianni',\n", " 13425: 'antwerp',\n", " 6541: 'celebrated',\n", " 52303: 'roarke',\n", " 40946: 'baggins',\n", " 31273: 'cheeseburgers',\n", " 52304: 'matras',\n", " 52305: \"nineties'\",\n", " 52306: \"'craig'\",\n", " 13002: 'celebrates',\n", " 3386: 'unintentionally',\n", " 14365: 'drafted',\n", " 52307: 'climby',\n", " 52308: '303',\n", " 18523: 'oldies',\n", " 9099: 'climbs',\n", " 9658: 'honour',\n", " 34755: 'plucking',\n", " 30077: '305',\n", " 5517: 'address',\n", " 40947: 'menjou',\n", " 42595: \"'freak'\",\n", " 19511: 'dwindling',\n", " 9461: 'benson',\n", " 52310: 'white’s',\n", " 40948: 'shamelessness',\n", " 21928: 'impacted',\n", " 52311: 'upatz',\n", " 3843: 'cusack',\n", " 37570: \"flavia's\",\n", " 52312: 'effette',\n", " 34756: 'influx',\n", " 52313: 'boooooooo',\n", " 52314: 'dimitrova',\n", " 13426: 'houseman',\n", " 25262: 'bigas',\n", " 52315: 'boylen',\n", " 52316: 'phillipenes',\n", " 40949: 'fakery',\n", " 27661: \"grandpa's\",\n", " 27662: 'darnell',\n", " 19512: 'undergone',\n", " 52318: 'handbags',\n", " 21929: 'perished',\n", " 37781: 'pooped',\n", " 27663: 'vigour',\n", " 3630: 'opposed',\n", " 52319: 'etude',\n", " 11802: \"caine's\",\n", " 52320: 'doozers',\n", " 34757: 'photojournals',\n", " 52321: 'perishes',\n", " 34758: 'constrains',\n", " 40951: 'migenes',\n", " 30608: 'consoled',\n", " 16830: 'alastair',\n", " 52322: 'wvs',\n", " 52323: 'ooooooh',\n", " 34759: 'approving',\n", " 40952: 'consoles',\n", " 52067: 'disparagement',\n", " 52325: 'futureistic',\n", " 52326: 'rebounding',\n", " 52327: \"'date\",\n", " 52328: 'gregoire',\n", " 21930: 'rutherford',\n", " 34760: 'americanised',\n", " 82199: 'novikov',\n", " 1045: 'following',\n", " 34761: 'munroe',\n", " 52329: \"morita'\",\n", " 52330: 'christenssen',\n", " 23109: 'oatmeal',\n", " 25263: 
'fossey',\n", " 40953: 'livered',\n", " 13003: 'listens',\n", " 76167: \"'marci\",\n", " 52333: \"otis's\",\n", " 23390: 'thanking',\n", " 16022: 'maude',\n", " 34762: 'extensions',\n", " 52335: 'ameteurish',\n", " 52336: \"commender's\",\n", " 27664: 'agricultural',\n", " 4521: 'convincingly',\n", " 17642: 'fueled',\n", " 54017: 'mahattan',\n", " 40955: \"paris's\",\n", " 52339: 'vulkan',\n", " 52340: 'stapes',\n", " 52341: 'odysessy',\n", " 12262: 'harmon',\n", " 4255: 'surfing',\n", " 23497: 'halloran',\n", " 49583: 'unbelieveably',\n", " 52342: \"'offed'\",\n", " 30610: 'quadrant',\n", " 19513: 'inhabiting',\n", " 34763: 'nebbish',\n", " 40956: 'forebears',\n", " 34764: 'skirmish',\n", " 52343: 'ocassionally',\n", " 52344: \"'resist\",\n", " 21931: 'impactful',\n", " 52345: 'spicier',\n", " 40957: 'touristy',\n", " 52346: \"'football'\",\n", " 40958: 'webpage',\n", " 52348: 'exurbia',\n", " 52349: 'jucier',\n", " 14904: 'professors',\n", " 34765: 'structuring',\n", " 30611: 'jig',\n", " 40959: 'overlord',\n", " 25264: 'disconnect',\n", " 82204: 'sniffle',\n", " 40960: 'slimeball',\n", " 40961: 'jia',\n", " 16831: 'milked',\n", " 40962: 'banjoes',\n", " 1240: 'jim',\n", " 52351: 'workforces',\n", " 52352: 'jip',\n", " 52353: 'rotweiller',\n", " 34766: 'mundaneness',\n", " 52354: \"'ninja'\",\n", " 11043: \"dead'\",\n", " 40963: \"cipriani's\",\n", " 20611: 'modestly',\n", " 52355: \"professor'\",\n", " 40964: 'shacked',\n", " 34767: 'bashful',\n", " 23391: 'sorter',\n", " 16123: 'overpowering',\n", " 18524: 'workmanlike',\n", " 27665: 'henpecked',\n", " 18525: 'sorted',\n", " 52357: \"jōb's\",\n", " 52358: \"'always\",\n", " 34768: \"'baptists\",\n", " 52359: 'dreamcatchers',\n", " 52360: \"'silence'\",\n", " 21932: 'hickory',\n", " 52361: 'fun\\x97yet',\n", " 52362: 'breakumentary',\n", " 15499: 'didn',\n", " 52363: 'didi',\n", " 52364: 'pealing',\n", " 40965: 'dispite',\n", " 25265: \"italy's\",\n", " 21933: 'instability',\n", " 6542: 'quarter',\n", " 12611: 
'quartet',\n", " 52365: 'padmé',\n", " 52366: \"'bleedmedry\",\n", " 52367: 'pahalniuk',\n", " 52368: 'honduras',\n", " 10789: 'bursting',\n", " 41468: \"pablo's\",\n", " 52370: 'irremediably',\n", " 40966: 'presages',\n", " 57835: 'bowlegged',\n", " 65186: 'dalip',\n", " 6263: 'entering',\n", " 76175: 'newsradio',\n", " 54153: 'presaged',\n", " 27666: \"giallo's\",\n", " 40967: 'bouyant',\n", " 52371: 'amerterish',\n", " 18526: 'rajni',\n", " 30613: 'leeves',\n", " 34770: 'macauley',\n", " 615: 'seriously',\n", " 52372: 'sugercoma',\n", " 52373: 'grimstead',\n", " 52374: \"'fairy'\",\n", " 30614: 'zenda',\n", " 52375: \"'twins'\",\n", " 17643: 'realisation',\n", " 27667: 'highsmith',\n", " 7820: 'raunchy',\n", " 40968: 'incentives',\n", " 52377: 'flatson',\n", " 35100: 'snooker',\n", " 16832: 'crazies',\n", " 14905: 'crazier',\n", " 7097: 'grandma',\n", " 52378: 'napunsaktha',\n", " 30615: 'workmanship',\n", " 52379: 'reisner',\n", " 61309: \"sanford's\",\n", " 52380: '\\x91doña',\n", " 6111: 'modest',\n", " 19156: \"everything's\",\n", " 40969: 'hamer',\n", " 52382: \"couldn't'\",\n", " 13004: 'quibble',\n", " 52383: 'socking',\n", " 21934: 'tingler',\n", " 52384: 'gutman',\n", " 40970: 'lachlan',\n", " 52385: 'tableaus',\n", " 52386: 'headbanger',\n", " 2850: 'spoken',\n", " 34771: 'cerebrally',\n", " 23493: \"'road\",\n", " 21935: 'tableaux',\n", " 40971: \"proust's\",\n", " 40972: 'periodical',\n", " 52388: \"shoveller's\",\n", " 25266: 'tamara',\n", " 17644: 'affords',\n", " 3252: 'concert',\n", " 87958: \"yara's\",\n", " 52389: 'someome',\n", " 8427: 'lingering',\n", " 41514: \"abraham's\",\n", " 34772: 'beesley',\n", " 34773: 'cherbourg',\n", " 28627: 'kagan',\n", " 9100: 'snatch',\n", " 9263: \"miyazaki's\",\n", " 25267: 'absorbs',\n", " 40973: \"koltai's\",\n", " 64030: 'tingled',\n", " 19514: 'crossroads',\n", " 16124: 'rehab',\n", " 52392: 'falworth',\n", " 52393: 'sequals',\n", " ...}" ] }, "execution_count": 14, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "word_index = imdb.get_word_index()\n", "word_index = { k: (v + 3) for k, v in word_index.items() }\n", "word_index[\"PAD\"] = 0\n", "word_index[\"START\"] = 1\n", "word_index[\"UNK\"] = 2\n", "index_word = { v: k for k, v in word_index.items() }\n", "index_word" ] }, { "cell_type": "markdown", "id": "a04781ef", "metadata": {}, "source": [ "#### Вывод первого отзыва из тренировочной выборки\n", "\n", "Отзывы содержат только идентификаторы для экономии памяти" ] }, { "cell_type": "code", "execution_count": 15, "id": "059670a8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 530,\n", " 973,\n", " 1622,\n", " 1385,\n", " 65,\n", " 458,\n", " 4468,\n", " 66,\n", " 3941,\n", " 2,\n", " 173,\n", " 2,\n", " 256,\n", " 2,\n", " 2,\n", " 100,\n", " 2,\n", " 838,\n", " 112,\n", " 50,\n", " 670,\n", " 2,\n", " 2,\n", " 2,\n", " 480,\n", " 284,\n", " 2,\n", " 150,\n", " 2,\n", " 172,\n", " 112,\n", " 167,\n", " 2,\n", " 336,\n", " 385,\n", " 2,\n", " 2,\n", " 172,\n", " 4536,\n", " 1111,\n", " 2,\n", " 546,\n", " 2,\n", " 2,\n", " 447,\n", " 2,\n", " 192,\n", " 50,\n", " 2,\n", " 2,\n", " 147,\n", " 2025,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 1920,\n", " 4613,\n", " 469,\n", " 2,\n", " 2,\n", " 71,\n", " 87,\n", " 2,\n", " 2,\n", " 2,\n", " 530,\n", " 2,\n", " 76,\n", " 2,\n", " 2,\n", " 1247,\n", " 2,\n", " 2,\n", " 2,\n", " 515,\n", " 2,\n", " 2,\n", " 2,\n", " 626,\n", " 2,\n", " 2,\n", " 2,\n", " 62,\n", " 386,\n", " 2,\n", " 2,\n", " 316,\n", " 2,\n", " 106,\n", " 2,\n", " 2,\n", " 2223,\n", " 2,\n", " 2,\n", " 480,\n", " 66,\n", " 3785,\n", " 2,\n", " 2,\n", " 130,\n", " 2,\n", " 2,\n", " 2,\n", " 619,\n", " 2,\n", " 2,\n", " 124,\n", " 51,\n", " 2,\n", " 135,\n", " 2,\n", " 2,\n", " 1415,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 215,\n", " 2,\n", " 77,\n", " 52,\n", " 2,\n", " 2,\n", " 407,\n", " 2,\n", " 82,\n", " 2,\n", " 2,\n", " 2,\n", " 107,\n", " 117,\n", " 2,\n", " 
2,\n", " 256,\n", " 2,\n", " 2,\n", " 2,\n", " 3766,\n", " 2,\n", " 723,\n", " 2,\n", " 71,\n", " 2,\n", " 530,\n", " 476,\n", " 2,\n", " 400,\n", " 317,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 1029,\n", " 2,\n", " 104,\n", " 88,\n", " 2,\n", " 381,\n", " 2,\n", " 297,\n", " 98,\n", " 2,\n", " 2071,\n", " 56,\n", " 2,\n", " 141,\n", " 2,\n", " 194,\n", " 2,\n", " 2,\n", " 2,\n", " 226,\n", " 2,\n", " 2,\n", " 134,\n", " 476,\n", " 2,\n", " 480,\n", " 2,\n", " 144,\n", " 2,\n", " 2,\n", " 2,\n", " 51,\n", " 2,\n", " 2,\n", " 224,\n", " 92,\n", " 2,\n", " 104,\n", " 2,\n", " 226,\n", " 65,\n", " 2,\n", " 2,\n", " 1334,\n", " 88,\n", " 2,\n", " 2,\n", " 283,\n", " 2,\n", " 2,\n", " 4472,\n", " 113,\n", " 103,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 178,\n", " 2]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train[0]" ] }, { "cell_type": "markdown", "id": "d0eca5f5", "metadata": {}, "source": [ "#### Можно заменить идентификаторы на реальные слова с учетом предобработки" ] }, { "cell_type": "code", "execution_count": 16, "id": "4c1912fa", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"UNK UNK UNK UNK UNK brilliant casting location scenery story direction everyone's really suited UNK part UNK played UNK UNK could UNK imagine being there robert UNK UNK UNK amazing actor UNK now UNK same being director UNK father came UNK UNK same scottish island UNK myself UNK UNK loved UNK fact there UNK UNK real connection UNK UNK UNK UNK witty remarks throughout UNK UNK were great UNK UNK UNK brilliant UNK much UNK UNK bought UNK UNK UNK soon UNK UNK UNK released UNK UNK UNK would recommend UNK UNK everyone UNK watch UNK UNK fly UNK UNK amazing really cried UNK UNK end UNK UNK UNK sad UNK UNK know what UNK say UNK UNK cry UNK UNK UNK UNK must UNK been good UNK UNK definitely UNK also UNK UNK UNK two little UNK UNK played UNK UNK UNK norman UNK paul UNK were UNK brilliant children UNK often left UNK UNK UNK 
UNK list UNK think because UNK stars UNK play them UNK grown up UNK such UNK big UNK UNK UNK whole UNK UNK these children UNK amazing UNK should UNK UNK UNK what UNK UNK done don't UNK think UNK whole story UNK UNK lovely because UNK UNK true UNK UNK someone's life after UNK UNK UNK UNK UNK us UNK\"" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Map every token id of the first (filtered) review back to its word.\n", "\" \".join(index_word[token_id] for token_id in X_train[0])" ] }, { "cell_type": "markdown", "id": "ab903f3e", "metadata": {}, "source": [ "#### Можно вывести изначальный отзыв (если выключить удаление редких слов и стоп-слов)" ] }, { "cell_type": "code", "execution_count": 17, "id": "faf79c8d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"START this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert redford's is an amazing actor and now the same being director norman's father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for retail and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also congratulations to the two little boy's that played the part's of norman and paul they were just brilliant children are often left out of the praising list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you think the whole story was so lovely because it was true and was someone's life after all that was shared with us all\"" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": 
[ "(textual_X_train, _), _ = imdb.load_data()\n", "\" \".join(index_word[id] for id in textual_X_train[0])" ] }, { "cell_type": "markdown", "id": "35e4c578", "metadata": {}, "source": [ "#### Приведение отзывов к длине max_length (100)\n", "\n", "padding и truncating - дополнение и обрезка отзывов выполняются с начала последовательности (учитывается специфика затухания градиента в рекуррентных сетях)" ] }, { "cell_type": "code", "execution_count": 18, "id": "131e125a", "metadata": {}, "outputs": [], "source": [ "from keras.api.preprocessing.sequence import pad_sequences\n", "\n", "X_train = pad_sequences(X_train, maxlen=max_length, padding=\"pre\", truncating=\"pre\", value=0)\n", "X_valid = pad_sequences(X_valid, maxlen=max_length, padding=\"pre\", truncating=\"pre\", value=0)" ] }, { "cell_type": "markdown", "id": "87eac800", "metadata": {}, "source": [ "#### Формирование архитектуры глубокой полносвязной сети\n", "\n", "Первый слой (Embedding) выполняет векторизацию" ] }, { "cell_type": "code", "execution_count": 19, "id": "6a2e7a0e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Model: \"sequential_1\"\n",
       "
\n" ], "text/plain": [ "\u001b[1mModel: \"sequential_1\"\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
       "┃ Layer (type)                     Output Shape                  Param # ┃\n",
       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
       "│ embedding_1 (Embedding)         │ (None, 100, 64)        │       320,000 │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ flatten_1 (Flatten)             │ (None, 6400)           │             0 │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ dense_2 (Dense)                 │ (None, 64)             │       409,664 │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ dropout_1 (Dropout)             │ (None, 64)             │             0 │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ dense_3 (Dense)                 │ (None, 1)              │            65 │\n",
       "└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
       "
\n" ], "text/plain": [ "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n", "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n", "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n", "│ embedding_1 (\u001b[38;5;33mEmbedding\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m100\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m320,000\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ flatten_1 (\u001b[38;5;33mFlatten\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m6400\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ dense_2 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m409,664\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ dropout_1 (\u001b[38;5;33mDropout\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ dense_3 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m1\u001b[0m) │ \u001b[38;5;34m65\u001b[0m │\n", "└─────────────────────────────────┴────────────────────────┴───────────────┘\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Total params: 729,729 (2.78 MB)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m729,729\u001b[0m (2.78 MB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Trainable params: 729,729 (2.78 MB)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m729,729\u001b[0m (2.78 MB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Non-trainable params: 0 (0.00 B)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from keras.api.models import Sequential\n", "from keras.api.layers import Dense, Flatten, Dropout, Embedding, InputLayer\n", "\n", "simple_model = Sequential()\n", "simple_model.add(InputLayer(shape=(max_length,), dtype=\"float32\"))\n", "simple_model.add(Embedding(unique_words, 64))\n", "simple_model.add(Flatten())\n", "simple_model.add(Dense(64, activation=\"relu\"))\n", "simple_model.add(Dropout(0.5))\n", "simple_model.add(Dense(1, activation=\"sigmoid\"))\n", "\n", "simple_model.summary()" ] }, { "cell_type": "markdown", "id": "0ff9c40a", "metadata": {}, "source": [ "#### Обучение модели\n", "\n", "Модель (архитектура и веса) сохраняется в каталог tmp после каждой эпохи обучения с помощью callback-функции ModelCheckpoint\n", "\n", "В дальнейшем веса можно загрузить" ] }, { "cell_type": "code", "execution_count": 20, "id": "52043fc5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/4\n", "\u001b[1m196/196\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 19ms/step - accuracy: 0.5753 - loss: 0.6517 - val_accuracy: 0.8346 - val_loss: 0.3689\n", "Epoch 2/4\n", "\u001b[1m196/196\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 17ms/step - accuracy: 0.8922 - loss: 0.2751 - val_accuracy: 0.8460 - val_loss: 0.3510\n", "Epoch 3/4\n", "\u001b[1m196/196\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 18ms/step - accuracy: 0.9724 - loss: 0.1080 - val_accuracy: 0.8335 - val_loss: 0.4402\n", "Epoch 4/4\n", "\u001b[1m196/196\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 18ms/step - accuracy: 0.9974 - loss: 0.0224 - val_accuracy: 0.8337 - val_loss: 0.5407\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 20, "metadata": 
{}, "output_type": "execute_result" } ], "source": [ "from keras.api.callbacks import ModelCheckpoint\n", "\n", "simple_model.compile(\n", " loss=\"binary_crossentropy\",\n", " optimizer=\"adam\",\n", " metrics=[\"accuracy\"],\n", ")\n", "\n", "simple_model.fit(\n", " X_train,\n", " y_train,\n", " batch_size=128,\n", " epochs=4,\n", " validation_data=(X_valid, y_valid),\n", " callbacks=[ModelCheckpoint(filepath=output_dir + \"/simple_weights.{epoch:02d}.keras\")],\n", ")" ] }, { "cell_type": "markdown", "id": "3c495301", "metadata": {}, "source": [ "#### Загрузка лучшей модели и оценка ее качества\n", "\n", "Качество модели - 84.6 %." ] }, { "cell_type": "code", "execution_count": 21, "id": "73443ddb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m782/782\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 5ms/step - accuracy: 0.8436 - loss: 0.3559\n" ] }, { "data": { "text/plain": [ "[0.3510318398475647, 0.8459600210189819]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "simple_model.load_weights(output_dir + \"/simple_weights.02.keras\")\n", "simple_model.evaluate(X_valid, y_valid)" ] }, { "cell_type": "markdown", "id": "b1157104", "metadata": {}, "source": [ "#### Визуализация распределения вероятностей результатов модели на валидационной выборке" ] }, { "cell_type": "code", "execution_count": 22, "id": "069236c0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m782/782\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step\n" ] }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAjAAAAGdCAYAAAAMm0nCAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAvmklEQVR4nO3deXQUZb7/8U8WuhOW7gCabjIEiHIFoqgDjNAqzqC5RIxeHXAUzSAj24UJ3iE5l+3KIOICooKgLFdRwpyBQZgjXiUCxiBwlLAYjUaWqAMaZrAbHUwaELLW7w9+KWkBh45ZeOL7dU6d06nnW9XfegTqY6WqO8KyLEsAAAAGiWzqBgAAAMJFgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGCe6qRtoKDU1NTp06JDatGmjiIiIpm4HAACcB8uydPToUSUkJCgy8tzXWZptgDl06JASExObug0AAFAHBw8eVMeOHc853mwDTJs2bSSdmgCXy9XE3QCoN1XHpVcSTr0efEiKbtW0/QCoV8FgUImJifZ5/FyabYCp/bWRy+UiwADNSVWU1PL/v3a5CDBAM/Wvbv/gJl4AAGCcsAJMdXW1/vjHPyopKUmxsbG69NJL9cgjj+j0L7S2LEvTp09Xhw4dFBsbq5SUFH366ach+zly5IjS09PlcrkUFxenkSNH6tixYyE1H330kfr376+YmBglJiZqzpw5P+IwAQBAcxJWgHniiSe0ePFiPffcc9q7d6+eeOIJzZkzR88++6xdM2fOHC1YsEBLlizRjh071KpVK6WmpurkyZN2TXp6unbv3q3c3FytW7dOW7du1ZgxY+zxYDCogQMHqnPnziooKNCTTz6pGTNm6Pnnn6+HQwYAAMazwpCWlmaNGDEiZN3gwYOt9PR0y7Isq6amxvJ6vdaTTz5pj5eWllpOp9P6y1/+YlmWZe3Zs8eSZO3atcuuWb9+vRUREWH94x//sCzLshYtWmS1bdvWKi8vt2smT55sdevW7bx7LSsrsyRZZWVl4RwigAtd5THLWqFTS+Wxpu4GQD073/N3WFdgrr32WuXl5emTTz6RJH344Yd65513NGjQIEnSgQMH5Pf7lZKSYm/jdrvVt29f5efnS5Ly8/MVFxenPn362DUpKSmKjIzUjh077JobbrhBDofDrklNTVVxcbG++eabs/ZWXl6uYDAYsgAAgOYprKeQpkyZomAwqO7duysqKkrV1dV67LHHlJ6eLkny+/2SJI/HE7Kdx+Oxx/x+v+Lj40ObiI5Wu3btQmqSkpLO2EftWNu2bc/obdasWXr44YfDORwAAGCosK7ArF69WitWrNDKlSv1/vvva/ny5Xrqqae0fPnyhurvvE2dOlVlZWX2cvDgwaZuCQAANJCwrsBMnDhRU6ZM0dChQyVJPXv21BdffKFZs2Zp+PDh8nq9kqRAIKAOHTrY2wUCAV199dWSJK/Xq8OHD4fst6qqSkeOHLG393q9CgQCITW1P9fWfJ/T6ZTT6QzncAAAgKHCugLz7bffnvG9BFFRUaqpqZEkJSUlyev1Ki8vzx4PBoPasWOHfD6fJMnn86m0tFQFBQV2zaZNm1RTU6O+ffvaNVu3blVlZaVdk5ubq27dup3110cAAOCnJawAc9ttt+mxxx5TTk6OPv/8c61du1Zz587Vr3/9a0mnPjVvwoQJevTRR/Xaa6+pqKhI9913nxISEnTHHXdIknr06KGbb75Zo0eP1s6dO/Xuu+9q/PjxGjp0qBISTn08+L333iuHw6GRI0dq9+7devnllzV//nxlZWXV79EDAAAzhfNoUzAYtP7whz9YnTp1smJiYqxLLrnEevDBB0Med66pqbH++Mc/Wh6Px3I6ndZNN91kFRcXh+znn//8p3XPPfdYrVu3tlwul3X//fdbR48eDan58MMPreuvv95yOp3Wz372M2v27NnhtMpj1EBzxWPUQLN2vufvCMs67WN0m5F
gMCi3262ysjK+CwloTqqOS6tbn3p91zG+CwloZs73/M13IQEAAOMQYAAAgHHCeowaAADUvy5Tcpq6hbB9PjutSd+fKzAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwDgEGAAAYhwADAACMQ4ABAADGIcAAAADjEGAAAIBxCDAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOOEFWC6dOmiiIiIM5aMjAxJ0smTJ5WRkaH27durdevWGjJkiAKBQMg+SkpKlJaWppYtWyo+Pl4TJ05UVVVVSM3mzZvVq1cvOZ1Ode3aVdnZ2T/uKAEAQLMSVoDZtWuXvvzyS3vJzc2VJP3mN7+RJGVmZur111/XmjVrtGXLFh06dEiDBw+2t6+urlZaWpoqKiq0bds2LV++XNnZ2Zo+fbpdc+DAAaWlpWnAgAEqLCzUhAkTNGrUKG3cuLE+jhcAADQDEZZlWXXdeMKECVq3bp0+/fRTBYNBXXzxxVq5cqXuvPNOSdK+ffvUo0cP5efnq1+/flq/fr1uvfVWHTp0SB6PR5K0ZMkSTZ48WV999ZUcDocmT56snJwcffzxx/b7DB06VKWlpdqwYcN59xYMBuV2u1VWViaXy1XXQwRwoak6Lq1ufer1Xcek6FZN2w9QD7pMyWnqFsL2+ey0Btnv+Z6/63wPTEVFhf785z9rxIgRioiIUEFBgSorK5WSkmLXdO/eXZ06dVJ+fr4kKT8/Xz179rTDiySlpqYqGAxq9+7dds3p+6itqd3HuZSXlysYDIYsAACgeapzgHn11VdVWlqq3/3ud5Ikv98vh8OhuLi4kDqPxyO/32/XnB5easdrx36oJhgM6sSJE+fsZ9asWXK73faSmJhY10MDAAAXuDoHmBdffFGDBg1SQkJCffZTZ1OnTlVZWZm9HDx4sKlbAgAADSS6Lht98cUXeuutt/TKK6/Y67xeryoqKlRaWhpyFSYQCMjr9do1O3fuDNlX7VNKp9d8/8mlQCAgl8ul2NjYc/bkdDrldDrrcjgAAMAwdboCs2zZMsXHxyst7bsbeHr37q0WLVooLy/PXldcXKySkhL5fD5Jks/nU1FRkQ4fPmzX5ObmyuVyKTk52a45fR+1NbX7AAAACDvA1NTUaNmyZRo+fLiio7+7gON2uzVy5EhlZWXp7bffVkFBge6//375fD7169dPkjRw4EAlJydr2LBh+vDDD7Vx40ZNmzZNGRkZ9tWTsWPHav/+/Zo0aZL27dunRYsWafXq1crMzKynQwYAAKYL+1dIb731lkpKSjRixIgzxubNm6fIyEgNGTJE5eXlSk1N1aJFi+zxqKgorVu3TuPGjZPP51OrVq00fPhwzZw5065JSkpSTk6OMjMzNX/+fHXs2FFLly5VampqHQ8RAAA0Nz/qc2AuZHwODNBM8TkwaIb4HJjvNPjnwAAAADQVAgwAADBOnR6j/qnjUh8AAE2LKzAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwDgEGAAAYhwADAACMQ4ABAADGIcAAAADjEGAAAIBxCDAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOOEHWD+8Y9/6Le//a3at2+v2NhY9ezZU++99549blmWpk+frg4dOig2NlYpKSn69NNPQ/Zx5MgRpaeny+VyKS4uTiNHjtSxY8dCaj766CP1799fMTExSkxM1Jw5c+p4iAAAoLkJK8B88803uu6669SiRQutX79ee/b
s0dNPP622bdvaNXPmzNGCBQu0ZMkS7dixQ61atVJqaqpOnjxp16Snp2v37t3Kzc3VunXrtHXrVo0ZM8YeDwaDGjhwoDp37qyCggI9+eSTmjFjhp5//vl6OGQAAGC66HCKn3jiCSUmJmrZsmX2uqSkJPu1ZVl65plnNG3aNN1+++2SpD/96U/yeDx69dVXNXToUO3du1cbNmzQrl271KdPH0nSs88+q1tuuUVPPfWUEhIStGLFClVUVOill16Sw+HQ5ZdfrsLCQs2dOzck6AAAgJ+msK7AvPbaa+rTp49+85vfKD4+Xj//+c/1wgsv2OMHDhyQ3+9XSkqKvc7tdqtv377Kz8+XJOXn5ysuLs4OL5KUkpKiyMhI7dixw6654YYb5HA47JrU1FQVFxfrm2++OWtv5eXlCgaDIQsAAGiewgow+/fv1+LFi/Vv//Zv2rhxo8aNG6f/+q//0vLlyyVJfr9fkuTxeEK283g89pjf71d8fHzIeHR0tNq1axdSc7Z9nP4e3zdr1iy53W57SUxMDOfQAACAQcIKMDU1NerVq5cef/xx/fznP9eYMWM0evRoLVmypKH6O29Tp05VWVmZvRw8eLCpWwIAAA0krADToUMHJScnh6zr0aOHSkpKJEler1eSFAgEQmoCgYA95vV6dfjw4ZDxqqoqHTlyJKTmbPs4/T2+z+l0yuVyhSwAAKB5CivAXHfddSouLg5Z98knn6hz586STt3Q6/V6lZeXZ48Hg0Ht2LFDPp9PkuTz+VRaWqqCggK7ZtOmTaqpqVHfvn3tmq1bt6qystKuyc3NVbdu3UKeeAIAAD9NYQWYzMxMbd++XY8//rg+++wzrVy5Us8//7wyMjIkSREREZowYYIeffRRvfbaayoqKtJ9992nhIQE3XHHHZJOXbG5+eabNXr0aO3cuVPvvvuuxo8fr6FDhyohIUGSdO+998rhcGjkyJHavXu3Xn75Zc2fP19ZWVn1e/QAAMBIYT1G/Ytf/EJr167V1KlTNXPmTCUlJemZZ55Renq6XTNp0iQdP35cY8aMUWlpqa6//npt2LBBMTExds2KFSs0fvx43XTTTYqMjNSQIUO0YMECe9ztduvNN99URkaGevfurYsuukjTp0/nEWoAACBJirAsy2rqJhpCMBiU2+1WWVlZvd8P02VKTr3urzF8PjutqVsA6kfVcWl161Ov7zomRbdq2n6AesB55Tvne/7mu5AAAIBxCDAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwDgEGAAAYhwADAACMQ4ABAADGIcAAAADjEGAAAIBxCDAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDhhBZgZM2YoIiIiZOnevbs9fvLkSWVkZKh9+/Zq3bq1hgwZokAgELKPkpISpaWlqWXLloqPj9fEiRNVVVUVUrN582b16tVLTqdTXbt2VXZ2dt2PEAAANDthX4G5/PLL9eWXX9rLO++8Y49lZmbq9ddf15o1a7RlyxYdOnRIgwcPtserq6uVlpamiooKbdu2TcuXL1d2dramT59u1xw4cEBpaWkaMGCACgsLNWHCBI0aNUobN278kYcKAACai+iwN4iOltfrPWN9WVmZXnzxRa1cuVI33nijJGnZsmXq0aOHtm/frn79+unNN9/Unj179NZbb8nj8ejqq6/WI488osmTJ2vGjBlyOBxasmSJkpKS9PTTT0uSevTooXfeeUfz5s1TamrqjzxcAADQHIR9BebTTz9VQkKCLrnkEqWnp6ukpESSVFBQoMrKSqWkpNi13bt3V6dOnZSfny9Jys/PV8+ePeXxeOya1NRUBYNB7d692645fR+1NbX7OJfy8nI
Fg8GQBQAANE9hBZi+ffsqOztbGzZs0OLFi3XgwAH1799fR48eld/vl8PhUFxcXMg2Ho9Hfr9fkuT3+0PCS+147dgP1QSDQZ04ceKcvc2aNUtut9teEhMTwzk0AABgkLB+hTRo0CD79ZVXXqm+ffuqc+fOWr16tWJjY+u9uXBMnTpVWVlZ9s/BYJAQAwBAM/WjHqOOi4vTZZddps8++0xer1cVFRUqLS0NqQkEAvY9M16v94ynkmp//lc1LpfrB0OS0+mUy+UKWQAAQPP0owLMsWPH9Le//U0dOnRQ79691aJFC+Xl5dnjxcXFKikpkc/nkyT5fD4VFRXp8OHDdk1ubq5cLpeSk5PtmtP3UVtTuw8AAICwAsx///d/a8uWLfr888+1bds2/frXv1ZUVJTuueceud1ujRw5UllZWXr77bdVUFCg+++/Xz6fT/369ZMkDRw4UMnJyRo2bJg+/PBDbdy4UdOmTVNGRoacTqckaezYsdq/f78mTZqkffv2adGiRVq9erUyMzPr/+gBAICRwroH5u9//7vuuece/fOf/9TFF1+s66+/Xtu3b9fFF18sSZo3b54iIyM1ZMgQlZeXKzU1VYsWLbK3j4qK0rp16zRu3Dj5fD61atVKw4cP18yZM+2apKQk5eTkKDMzU/Pnz1fHjh21dOlSHqEGAAC2CMuyrKZuoiEEg0G53W6VlZXV+/0wXabk1Ov+GsPns9OaugWgflQdl1a3PvX6rmNSdKum7QeoB5xXvnO+52++CwkAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwDgEGAAAYhwADAACMQ4ABAADGIcAAAADjEGAAAIBxCDAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjPOjAszs2bMVERGhCRMm2OtOnjypjIwMtW/fXq1bt9aQIUMUCARCtispKVFaWppatmyp+Ph4TZw4UVVVVSE1mzdvVq9eveR0OtW1a1dlZ2f/mFYBAEAzUucAs2vXLv3v//6vrrzyypD1mZmZev3117VmzRpt2bJFhw4d0uDBg+3x6upqpaWlqaKiQtu2bdPy5cuVnZ2t6dOn2zUHDhxQWlqaBgwYoMLCQk2YMEGjRo3Sxo0b69ouAABoRuoUYI4dO6b09HS98MILatu2rb2+rKxML774oubOnasbb7xRvXv31rJly7Rt2zZt375dkvTmm29qz549+vOf/6yrr75agwYN0iOPPKKFCxeqoqJCkrRkyRIlJSXp6aefVo8ePTR+/HjdeeedmjdvXj0cMgAAMF2dAkxGRobS0tKUkpISsr6goECVlZUh67t3765OnTopPz9fkpSfn6+ePXvK4/HYNampqQoGg9q9e7dd8/19p6am2vs4m/LycgWDwZAFAAA0T9HhbrBq1Sq9//772rVr1xljfr9fDodDcXFxIes9Ho/8fr9dc3p4qR2vHfuhmmAwqBMnTig2NvaM9541a5YefvjhcA8HAAAYKKwrMAcPHtQf/vAHrVixQjExMQ3VU51MnTpVZWVl9nLw4MGmbgkAADSQsAJMQUGBDh8+rF69eik6OlrR0dHasmWLFixYoOjoaHk8HlVUVKi0tDRku0AgIK/XK0nyer1nPJVU+/O/qnG5XGe9+iJJTqdTLpcrZAEAAM1TWAHmpptuUlFRkQoLC+2lT58+Sk9Pt1+3aNFCeXl59jbFxcUqKSmRz+eTJPl8PhUVFenw4cN2TW5urlwul5KTk+2a0/dRW1O7DwAA8NMW1j0wbdq00RVXXBGyrlWrVmrfvr29fuTIkcrKylK7du3kcrn0wAMPyOfzqV+/fpKkgQMHKjk5WcOGDdOcOXPk9/s1bdo0ZWR
kyOl0SpLGjh2r5557TpMmTdKIESO0adMmrV69Wjk5OfVxzAAAwHBh38T7r8ybN0+RkZEaMmSIysvLlZqaqkWLFtnjUVFRWrduncaNGyefz6dWrVpp+PDhmjlzpl2TlJSknJwcZWZmav78+erYsaOWLl2q1NTU+m4XAAAYKMKyLKupm2gIwWBQbrdbZWVl9X4/TJcp5l0J+nx2WlO3ANSPquPS6tanXt91TIpu1bT9APWA88p3zvf8zXchAQAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwDgEGAAAYhwADAACMQ4ABAADGIcAAAADjEGAAAIBxCDAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwDgEGAAAYhwADAACMQ4ABAADGIcAAAADjEGAAAIBxCDAAAMA4YQWYxYsX68orr5TL5ZLL5ZLP59P69evt8ZMnTyojI0Pt27dX69atNWTIEAUCgZB9lJSUKC0tTS1btlR8fLwmTpyoqqqqkJrNmzerV69ecjqd6tq1q7Kzs+t+hAAAoNkJK8B07NhRs2fPVkFBgd577z3deOONuv3227V7925JUmZmpl5//XWtWbNGW7Zs0aFDhzR48GB7++rqaqWlpamiokLbtm3T8uXLlZ2drenTp9s1Bw4cUFpamgYMGKDCwkJNmDBBo0aN0saNG+vpkAEAgOkiLMuyfswO2rVrpyeffFJ33nmnLr74Yq1cuVJ33nmnJGnfvn3q0aOH8vPz1a9fP61fv1633nqrDh06JI/HI0lasmSJJk+erK+++koOh0OTJ09WTk6OPv74Y/s9hg4dqtLSUm3YsOG8+woGg3K73SorK5PL5foxh3iGLlNy6nV/jeHz2WlN3QJQP6qOS6tbn3p91zEpulXT9gPUA84r3znf83ed74Gprq7WqlWrdPz4cfl8PhUUFKiyslIpKSl2Tffu3dWpUyfl5+dLkvLz89WzZ087vEhSamqqgsGgfRUnPz8/ZB+1NbX7OJfy8nIFg8GQBQAANE9hB5iioiK1bt1aTqdTY8eO1dq1a5WcnCy/3y+Hw6G4uLiQeo/HI7/fL0ny+/0h4aV2vHbsh2qCwaBOnDhxzr5mzZolt9ttL4mJieEeGgAAMETYAaZbt24qLCzUjh07NG7cOA0fPlx79uxpiN7CMnXqVJWVldnLwYMHm7olAADQQKLD3cDhcKhr166SpN69e2vXrl2aP3++7r77blVUVKi0tDTkKkwgEJDX65Ukeb1e7dy5M2R/tU8pnV7z/SeXAoGAXC6XYmNjz9mX0+mU0+kM93AAAICBfvTnwNTU1Ki8vFy9e/dWixYtlJeXZ48VFxerpKREPp9PkuTz+VRUVKTDhw/bNbm5uXK5XEpOTrZrTt9HbU3tPgAAAMK6AjN16lQNGjRInTp10tGjR7Vy5Upt3rxZGzdulNvt1siRI5WVlaV27drJ5XLpgQcekM/nU79+/SRJAwcOVHJysoYNG6Y5c+bI7/dr2rRpysjIsK+ejB07Vs8995wmTZqkESNGaNOmTVq9erVycsy7QxsAADSMsALM4cOHdd999+nLL7+U2+3WlVdeqY0bN+rf//3fJUnz5s1TZGSkhgwZovLycqWmpmrRokX29lFRUVq3bp3GjRsnn8+nVq1aafjw4Zo5c6Zdk5SUpJycHGVmZmr+/Pnq2LGjli5dqtTU1Ho6ZAAAYLof/TkwFyo+ByYUnwODZoPPgUEzxHnlOw3+OTAAAABNhQADAACMQ4ABAADGIcAAAADjEGAAAIBxCDAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAA
A40Q3dQMAANSnLlNymroFNAKuwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwTlgBZtasWfrFL36hNm3aKD4+XnfccYeKi4tDak6ePKmMjAy1b99erVu31pAhQxQIBEJqSkpKlJaWppYtWyo+Pl4TJ05UVVVVSM3mzZvVq1cvOZ1Ode3aVdnZ2XU7QgAA0OyEFWC2bNmijIwMbd++Xbm5uaqsrNTAgQN1/PhxuyYzM1Ovv/661qxZoy1btujQoUMaPHiwPV5dXa20tDRVVFRo27ZtWr58ubKzszV9+nS75sCBA0pLS9OAAQNUWFioCRMmaNSoUdq4cWM9HDIAADBdhGVZVl03/uqrrxQfH68tW7bohhtuUFlZmS6++GKtXLlSd955pyRp37596tGjh/Lz89WvXz+tX79et956qw4dOiSPxyNJWrJkiSZPnqyvvvpKDodDkydPVk5Ojj7++GP7vYYOHarS0lJt2LDhvHoLBoNyu90qKyuTy+Wq6yGeVZcpOfW6v8bw+ey0pm4BqB9Vx6XVrU+9vuuYFN2qafvBBcfEf6NN1FDnlfM9f/+oe2DKysokSe3atZMkFRQUqLKyUikpKXZN9+7d1alTJ+Xn50uS8vPz1bNnTzu8SFJqaqqCwaB2795t15y+j9qa2n2cTXl5uYLBYMgCAACapzoHmJqaGk2YMEHXXXedrrjiCkmS3++Xw+FQXFxcSK3H45Hf77drTg8vteO1Yz9UEwwGdeLEibP2M2vWLLndbntJTEys66EBAIALXJ0DTEZGhj7++GOtWrWqPvups6lTp6qsrMxeDh482NQtAQCABhJdl43Gjx+vdevWaevWrerYsaO93uv1qqKiQqWlpSFXYQKBgLxer12zc+fOkP3VPqV0es33n1wKBAJyuVyKjY09a09Op1NOp7Muh/OTYOrvhLl3BwBwNmFdgbEsS+PHj9fatWu1adMmJSUlhYz37t1bLVq0UF5enr2uuLhYJSUl8vl8kiSfz6eioiIdPnzYrsnNzZXL5VJycrJdc/o+amtq9wEAAH7awroCk5GRoZUrV+r//u//1KZNG/ueFbfbrdjYWLndbo0cOVJZWVlq166dXC6XHnjgAfl8PvXr10+SNHDgQCUnJ2vYsGGaM2eO/H6/pk2bpoyMDPsKytixY/Xcc89p0qRJGjFihDZt2qTVq1crJ8fMqwgAAKB+hXUFZvHixSorK9OvfvUrdejQwV5efvllu2bevHm69dZbNWTIEN1www3yer165ZVX7PGoqCitW7dOUVFR8vl8+u1vf6v77rtPM2fOtGuSkpKUk5Oj3NxcXXXVVXr66ae1dOlSpaam1sMhAwAA04V1BeZ8PjImJiZGCxcu1MKFC89Z07lzZ73xxhs/uJ9f/epX+uCDD8JpDwAA/ETwXUgAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwDgEGAAAYhwADAACMQ4ABAADGIcAAAADjRDd1AwCAC1eXKTlN3QJwVlyBAQAAxiHAAAAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwDo9R44Jm4iOcn89Oa+oWAKDZ4woMAAAwDgEGAAAYhwADAACME3aA2bp1q2677TYlJCQoIiJCr776asi4ZVmaPn26OnTooNjYWKWkpOjTTz8NqTly5IjS09PlcrkUFxenkSNH6tixYyE1H330kfr376+YmBglJiZqzpw54R8dAABolsIOMMePH9dVV12lhQsXnnV8zpw5WrBggZYsWaIdO3aoVatWSk1N1cmTJ+2a9PR07d69W7m5uVq3bp22bt2qMWPG2OPBYFADBw5U586
dVVBQoCeffFIzZszQ888/X4dDBAAAzU3YTyENGjRIgwYNOuuYZVl65plnNG3aNN1+++2SpD/96U/yeDx69dVXNXToUO3du1cbNmzQrl271KdPH0nSs88+q1tuuUVPPfWUEhIStGLFClVUVOill16Sw+HQ5ZdfrsLCQs2dOzck6AAAgJ+mer0H5sCBA/L7/UpJSbHXud1u9e3bV/n5+ZKk/Px8xcXF2eFFklJSUhQZGakdO3bYNTfccIMcDoddk5qaquLiYn3zzTdnfe/y8nIFg8GQBQAANE/1GmD8fr8kyePxhKz3eDz2mN/vV3x8fMh4dHS02rVrF1Jztn2c/h7fN2vWLLndbntJTEz88QcEAAAuSM3mg+ymTp2qrKws++dgMEiIAXBBMfGDGYELVb1egfF6vZKkQCAQsj4QCNhjXq9Xhw8fDhmvqqrSkSNHQmrOto/T3+P7nE6nXC5XyAIAAJqneg0wSUlJ8nq9ysvLs9cFg0Ht2LFDPp9PkuTz+VRaWqqCggK7ZtOmTaqpqVHfvn3tmq1bt6qystKuyc3NVbdu3dS2bdv6bBkAABgo7ABz7NgxFRYWqrCwUNKpG3cLCwtVUlKiiIgITZgwQY8++qhee+01FRUV6b777lNCQoLuuOMOSVKPHj108803a/To0dq5c6feffddjR8/XkOHDlVCQoIk6d5775XD4dDIkSO1e/duvfzyy5o/f37Ir4gAAMBPV9j3wLz33nsaMGCA/XNtqBg+fLiys7M1adIkHT9+XGPGjFFpaamuv/56bdiwQTExMfY2K1as0Pjx43XTTTcpMjJSQ4YM0YIFC+xxt9utN998UxkZGerdu7cuuugiTZ8+nUeoYQQT73PgCygBmCbCsiyrqZtoCMFgUG63W2VlZfV+P4yJJyjghxgVYKqOS6tbS5J6FP1VJ6yYf7EBgIbQUP9unO/5u9k8hQSg7kwK5bERJ7W3Z1N3AaCp8WWOAADAOAQYAABgHAIMAAAwDgEGAAAYhwADAACMQ4ABAADGIcAAAADjEGAAAIBxCDAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwDgEGAAAYhwADAACMQ4ABAADGIcAAAADjEGAAAIBxCDAAAMA4F3SAWbhwobp06aKYmBj17dtXO3fubOqWAADABeCCDTAvv/yysrKy9NBDD+n999/XVVddpdTUVB0+fLipWwMAAE3sgg0wc+fO1ejRo3X//fcrOTlZS5YsUcuWLfXSSy81dWsAAKCJRTd1A2dTUVGhgoICTZ061V4XGRmplJQU5efnn3Wb8vJylZeX2z+XlZVJkoLBYL33V1P+bb3vE8D5qY44qeD//ytYXf6taqyapm0I+IlqiPPr6fu1LOsH6y7IAPP111+rurpaHo8nZL3H49G+ffvOus2sWbP08MMPn7E+MTGxQXoE0HTc9qv7mrAL4KfN/UzD7v/o0aNyu93nHL8gA0xdTJ06VVlZWfbPNTU1OnLkiNq3b6+IiIh6e59gMKjExEQdPHhQLper3vaLMzHXjYN5bhzMc+NgnhtHQ86zZVk6evSoEhISfrDuggwwF110kaKiohQIBELWBwIBeb3es27jdDrldDpD1sXFxTVUi3K5XPzlaCTMdeNgnhsH89w4mOfG0VDz/ENXXmpdkDfxOhwO9e7dW3l5efa6mpoa5eXlyefzNWFnAADgQnBBXoGRpKysLA0fPlx9+vTRNddco2eeeUbHjx/X/fff39StAQCAJnbBBpi7775bX331laZPny6/36+rr75aGzZsOOPG3sbmdDr10EMPnfHrKtQ/5rpxMM+Ng3luHMxz47gQ5jnC+lfPKQEAAFxgLsh7YAAAAH4IAQYAABiHAAMAAIxDgAEAAMYhwJzFwoU
L1aVLF8XExKhv377auXPnD9avWbNG3bt3V0xMjHr27Kk33nijkTo1Xzhz/cILL6h///5q27at2rZtq5SUlH/53wanhPtnutaqVasUERGhO+64o2EbbCbCnefS0lJlZGSoQ4cOcjqduuyyy/j34zyEO8/PPPOMunXrptjYWCUmJiozM1MnT55spG7NtHXrVt12221KSEhQRESEXn311X+5zebNm9WrVy85nU517dpV2dnZDdukhRCrVq2yHA6H9dJLL1m7d++2Ro8ebcXFxVmBQOCs9e+++64VFRVlzZkzx9qzZ481bdo0q0WLFlZRUVEjd26ecOf63nvvtRYuXGh98MEH1t69e63f/e53ltvttv7+9783cudmCXeeax04cMD62c9+ZvXv39+6/fbbG6dZg4U7z+Xl5VafPn2sW265xXrnnXesAwcOWJs3b7YKCwsbuXOzhDvPK1assJxOp7VixQrrwIED1saNG60OHTpYmZmZjdy5Wd544w3rwQcftF555RVLkrV27dofrN+/f7/VsmVLKysry9qzZ4/17LPPWlFRUdaGDRsarEcCzPdcc801VkZGhv1zdXW1lZCQYM2aNeus9XfddZeVlpYWsq5v377Wf/7nfzZon81BuHP9fVVVVVabNm2s5cuXN1SLzUJd5rmqqsq69tprraVLl1rDhw8nwJyHcOd58eLF1iWXXGJVVFQ0VovNQrjznJGRYd14440h67KysqzrrruuQftsTs4nwEyaNMm6/PLLQ9bdfffdVmpqaoP1xa+QTlNRUaGCggKlpKTY6yIjI5WSkqL8/PyzbpOfnx9SL0mpqannrMcpdZnr7/v2229VWVmpdu3aNVSbxqvrPM+cOVPx8fEaOXJkY7RpvLrM82uvvSafz6eMjAx5PB5dccUVevzxx1VdXd1YbRunLvN87bXXqqCgwP410/79+/XGG2/olltuaZSefyqa4lx4wX4Sb1P4+uuvVV1dfcan/Xo8Hu3bt++s2/j9/rPW+/3+BuuzOajLXH/f5MmTlZCQcMZfGnynLvP8zjvv6MUXX1RhYWEjdNg81GWe9+/fr02bNik9PV1vvPGGPvvsM/3+979XZWWlHnroocZo2zh1med7771XX3/9ta6//npZlqWqqiqNHTtW//M//9MYLf9knOtcGAwGdeLECcXGxtb7e3IFBkaaPXu2Vq1apbVr1yomJqap22k2jh49qmHDhumFF17QRRdd1NTtNGs1NTWKj4/X888/r969e+vuu+/Wgw8+qCVLljR1a83K5s2b9fjjj2vRokV6//339corrygnJ0ePPPJIU7eGH4krMKe56KKLFBUVpUAgELI+EAjI6/WedRuv1xtWPU6py1zXeuqppzR79my99dZbuvLKKxuyTeOFO89/+9vf9Pnnn+u2226z19XU1EiSoqOjVVxcrEsvvbRhmzZQXf48d+jQQS1atFBUVJS9rkePHvL7/aqoqJDD4WjQnk1Ul3n+4x//qGHDhmnUqFGSpJ49e+r48eMaM2aMHnzwQUVG8v/x9eFc50KXy9UgV18krsCEcDgc6t27t/Ly8ux1NTU1ysvLk8/nO+s2Pp8vpF6ScnNzz1mPU+oy15I0Z84cPfLII9qwYYP69OnTGK0aLdx57t69u4qKilRYWGgv//Ef/6EBAwaosLBQiYmJjdm+Mery5/m6667TZ599ZgdESfrkk0/UoUMHwss51GWev/322zNCSm1otPgqwHrTJOfCBrs92FCrVq2ynE6nlZ2dbe3Zs8caM2aMFRcXZ/n9fsuyLGvYsGHWlClT7Pp3333Xio6Otp566ilr79691kMPPcRj1Ocp3LmePXu25XA4rL/+9a/Wl19+aS9Hjx5tqkMwQrjz/H08hXR+wp3nkpISq02bNtb48eOt4uJia926dVZ8fLz16KOPNtUhGCHceX7ooYesNm3aWH/5y1+s/fv3W2+++aZ16aWXWnfddVdTHYIRjh49an3wwQfWBx98YEm
y5s6da33wwQfWF198YVmWZU2ZMsUaNmyYXV/7GPXEiROtvXv3WgsXLuQx6qbw7LPPWp06dbIcDod1zTXXWNu3b7fHfvnLX1rDhw8PqV+9erV12WWXWQ6Hw7r88sutnJycRu7YXOHMdefOnS1JZywPPfRQ4zdumHD/TJ+OAHP+wp3nbdu2WX379rWcTqd1ySWXWI899phVVVXVyF2bJ5x5rqystGbMmGFdeumlVkxMjJWYmGj9/ve/t7755pvGb9wgb7/99ln/va2d2+HDh1u//OUvz9jm6quvthwOh3XJJZdYy5Yta9AeIyyLa2gAAMAs3AMDAACMQ4ABAADGIcAAAADjEGAAAIBxCDAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHH+H/d199tGubfiAAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "\n", "plt.hist(simple_model.predict(X_valid))\n", "_ = plt.axvline(x=0.5, color=\"orange\")" ] } ], "metadata": { "kernelspec": { "display_name": ".venv (3.12.10)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.10" } }, "nbformat": 4, "nbformat_minor": 5 }