69 KiB
69 KiB
In [12]:
import os
# Select the torch backend. This is set before `import keras` below; Keras is
# expected to read KERAS_BACKEND at import time, so the order matters.
os.environ["KERAS_BACKEND"] = "torch"
import keras
# Record the Keras version this notebook was run with.
print(keras.__version__)
3.9.2
In [13]:
from keras.api.datasets import imdb
import os
# Vocabulary size passed to the dataset loader as `num_words` (and later used as
# the embedding input dimension).
unique_words = 5000
# Fixed review length used when padding/truncating and as the model input shape.
max_length = 100
# Directory that receives the per-epoch model checkpoints.
output_dir = "tmp"

# `exist_ok=True` creates the directory only when it is missing, without the
# separate existence check (which is racy and needs an indented body).
os.makedirs(output_dir, exist_ok=True)

# Load the IMDB reviews as lists of integer word indices. `num_words` caps the
# vocabulary and `skip_top=50` drops the 50 most frequent words; dropped words
# show up as the out-of-vocabulary index in the loaded sequences.
(X_train, y_train), (X_valid, y_valid) = imdb.load_data(
    num_words=unique_words,
    skip_top=50,
)
In [14]:
# Build the word -> index lookup. Every dataset index is shifted up by 3 so that
# indices 0, 1 and 2 are free for the special tokens added right after.
word_index = {word: rank + 3 for word, rank in imdb.get_word_index().items()}
word_index.update({"PAD": 0, "START": 1, "UNK": 2})

# Inverse lookup (index -> word), used to turn encoded reviews back into text.
index_word = dict((index, word) for word, index in word_index.items())
index_word
Out[14]:
{34704: 'fawn', 52009: 'tsukino', 52010: 'nunnery', 16819: 'sonja', 63954: 'vani', 1411: 'woods', 16118: 'spiders', 2348: 'hanging', 2292: 'woody', 52011: 'trawling', 52012: "hold's", 11310: 'comically', 40833: 'localized', 30571: 'disobeying', 52013: "'royale", 40834: "harpo's", 52014: 'canet', 19316: 'aileen', 52015: 'acurately', 52016: "diplomat's", 25245: 'rickman', 6749: 'arranged', 52017: 'rumbustious', 52018: 'familiarness', 52019: "spider'", 68807: 'hahahah', 52020: "wood'", 40836: 'transvestism', 34705: "hangin'", 2341: 'bringing', 40837: 'seamier', 34706: 'wooded', 52021: 'bravora', 16820: 'grueling', 1639: 'wooden', 16821: 'wednesday', 52022: "'prix", 34707: 'altagracia', 52023: 'circuitry', 11588: 'crotch', 57769: 'busybody', 52024: "tart'n'tangy", 14132: 'burgade', 52026: 'thrace', 11041: "tom's", 52028: 'snuggles', 29117: 'francesco', 52030: 'complainers', 52128: 'templarios', 40838: '272', 52031: '273', 52133: 'zaniacs', 34709: '275', 27634: 'consenting', 40839: 'snuggled', 15495: 'inanimate', 52033: 'uality', 11929: 'bronte', 4013: 'errors', 3233: 'dialogs', 52034: "yomada's", 34710: "madman's", 30588: 'dialoge', 52036: 'usenet', 40840: 'videodrome', 26341: "kid'", 52037: 'pawed', 30572: "'girlfriend'", 52038: "'pleasure", 52039: "'reloaded'", 40842: "kazakos'", 52040: 'rocque', 52041: 'mailings', 11930: 'brainwashed', 16822: 'mcanally', 52042: "tom''", 25246: 'kurupt', 21908: 'affiliated', 52043: 'babaganoosh', 40843: "noe's", 40844: 'quart', 362: 'kids', 5037: 'uplifting', 7096: 'controversy', 21909: 'kida', 23382: 'kidd', 52044: "error'", 52045: 'neurologist', 18513: 'spotty', 30573: 'cobblers', 9881: 'projection', 40845: 'fastforwarding', 52046: 'sters', 52047: "eggar's", 52048: 'etherything', 40846: 'gateshead', 34711: 'airball', 25247: 'unsinkable', 7183: 'stern', 52049: "cervi's", 40847: 'dnd', 11589: 'dna', 20601: 'insecurity', 52050: "'reboot'", 11040: 'trelkovsky', 52051: 'jaekel', 52052: 'sidebars', 52053: "sforza's", 17636: 
'distortions', 52054: 'mutinies', 30605: 'sermons', 40849: '7ft', 52055: 'boobage', 52056: "o'bannon's", 23383: 'populations', 52057: 'chulak', 27636: 'mesmerize', 52058: 'quinnell', 10310: 'yahoo', 52060: 'meteorologist', 42580: 'beswick', 15496: 'boorman', 40850: 'voicework', 52061: "ster'", 22925: 'blustering', 52062: 'hj', 27637: 'intake', 5624: 'morally', 40852: 'jumbling', 52063: 'bowersock', 52064: "'porky's'", 16824: 'gershon', 40853: 'ludicrosity', 52065: 'coprophilia', 40854: 'expressively', 19503: "india's", 34713: "post's", 52066: 'wana', 5286: 'wang', 30574: 'wand', 25248: 'wane', 52324: 'edgeways', 34714: 'titanium', 40855: 'pinta', 181: 'want', 30575: 'pinto', 52068: 'whoopdedoodles', 21911: 'tchaikovsky', 2106: 'travel', 52069: "'victory'", 11931: 'copious', 22436: 'gouge', 52070: "chapters'", 6705: 'barbra', 30576: 'uselessness', 52071: "wan'", 27638: 'assimilated', 16119: 'petiot', 52072: 'most\x85and', 3933: 'dinosaurs', 355: 'wrong', 52073: 'seda', 52074: 'stollen', 34715: 'sentencing', 40856: 'ouroboros', 40857: 'assimilates', 40858: 'colorfully', 27639: 'glenne', 52075: 'dongen', 4763: 'subplots', 52076: 'kiloton', 23384: 'chandon', 34716: "effect'", 27640: 'snugly', 40859: 'kuei', 9095: 'welcomed', 30074: 'dishonor', 52078: 'concurrence', 23385: 'stoicism', 14899: "guys'", 52080: "beroemd'", 6706: 'butcher', 40860: "melfi's", 30626: 'aargh', 20602: 'playhouse', 11311: 'wickedly', 1183: 'fit', 52081: 'labratory', 40862: 'lifeline', 1930: 'screaming', 4290: 'fix', 52082: 'cineliterate', 52083: 'fic', 52084: 'fia', 34717: 'fig', 52085: 'fmvs', 52086: 'fie', 52087: 'reentered', 30577: 'fin', 52088: 'doctresses', 52089: 'fil', 12609: 'zucker', 31934: 'ached', 52091: 'counsil', 52092: 'paterfamilias', 13888: 'songwriter', 34718: 'shivam', 9657: 'hurting', 302: 'effects', 52093: 'slauther', 52094: "'flame'", 52095: 'sommerset', 52096: 'interwhined', 27641: 'whacking', 52097: 'bartok', 8778: 'barton', 21912: 'frewer', 52098: "fi'", 6195: 'ingrid', 
30578: 'stribor', 52099: 'approporiately', 52100: 'wobblyhand', 52101: 'tantalisingly', 52102: 'ankylosaurus', 17637: 'parasites', 52103: 'childen', 52104: "jenkins'", 52105: 'metafiction', 17638: 'golem', 40863: 'indiscretion', 23386: "reeves'", 57784: "inamorata's", 52107: 'brittannica', 7919: 'adapt', 30579: "russo's", 48249: 'guitarists', 10556: 'abbott', 40864: 'abbots', 17652: 'lanisha', 40866: 'magickal', 52108: 'mattter', 52109: "'willy", 34719: 'pumpkins', 52110: 'stuntpeople', 30580: 'estimate', 40867: 'ugghhh', 11312: 'gameplay', 52111: "wern't", 40868: "n'sync", 16120: 'sickeningly', 40869: 'chiara', 4014: 'disturbed', 40870: 'portmanteau', 52112: 'ineffectively', 82146: "duchonvey's", 37522: "nasty'", 1288: 'purpose', 52115: 'lazers', 28108: 'lightened', 52116: 'kaliganj', 52117: 'popularism', 18514: "damme's", 30581: 'stylistics', 52118: 'mindgaming', 46452: 'spoilerish', 52120: "'corny'", 34721: 'boerner', 6795: 'olds', 52121: 'bakelite', 27642: 'renovated', 27643: 'forrester', 52122: "lumiere's", 52027: 'gaskets', 887: 'needed', 34722: 'smight', 1300: 'master', 25908: "edie's", 40871: 'seeber', 52123: 'hiya', 52124: 'fuzziness', 14900: 'genesis', 12610: 'rewards', 30582: 'enthrall', 40872: "'about", 52125: "recollection's", 11042: 'mutilated', 52126: 'fatherlands', 52127: "fischer's", 5402: 'positively', 34708: '270', 34723: 'ahmed', 9839: 'zatoichi', 13889: 'bannister', 52130: 'anniversaries', 30583: "helm's", 52131: "'work'", 34724: 'exclaimed', 52132: "'unfunny'", 52032: '274', 547: 'feeling', 52134: "wanda's", 33269: 'dolan', 52136: '278', 52137: 'peacoat', 40873: 'brawny', 40874: 'mishra', 40875: 'worlders', 52138: 'protags', 52139: 'skullcap', 57599: 'dastagir', 5625: 'affairs', 7802: 'wholesome', 52140: 'hymen', 25249: 'paramedics', 52141: 'unpersons', 52142: 'heavyarms', 52143: 'affaire', 52144: 'coulisses', 40876: 'hymer', 52145: 'kremlin', 30584: 'shipments', 52146: 'pixilated', 30585: "'00s", 18515: 'diminishing', 1360: 'cinematic', 
14901: 'resonates', 40877: 'simplify', 40878: "nature'", 40879: 'temptresses', 16825: 'reverence', 19505: 'resonated', 34725: 'dailey', 52147: '2\x85', 27644: 'treize', 52148: 'majo', 21913: 'kiya', 52149: 'woolnough', 39800: 'thanatos', 35734: 'sandoval', 40882: 'dorama', 52150: "o'shaughnessy", 4991: 'tech', 32021: 'fugitives', 30586: 'teck', 76128: "'e'", 40884: 'doesn’t', 52152: 'purged', 660: 'saying', 41098: "martians'", 23421: 'norliss', 27645: 'dickey', 52155: 'dicker', 52156: "'sependipity", 8425: 'padded', 57795: 'ordell', 40885: "sturges'", 52157: 'independentcritics', 5748: 'tempted', 34727: "atkinson's", 25250: 'hounded', 52158: 'apace', 15497: 'clicked', 30587: "'humor'", 17180: "martino's", 52159: "'supporting", 52035: 'warmongering', 34728: "zemeckis's", 21914: 'lube', 52160: 'shocky', 7479: 'plate', 40886: 'plata', 40887: 'sturgess', 40888: "nerds'", 20603: 'plato', 34729: 'plath', 40889: 'platt', 52162: 'mcnab', 27646: 'clumsiness', 3902: 'altogether', 42587: 'massacring', 52163: 'bicenntinial', 40890: 'skaal', 14363: 'droning', 8779: 'lds', 21915: 'jaguar', 34730: "cale's", 1780: 'nicely', 4591: 'mummy', 18516: "lot's", 10089: 'patch', 50205: 'kerkhof', 52164: "leader's", 27647: "'movie", 52165: 'uncomfirmed', 40891: 'heirloom', 47363: 'wrangle', 52166: 'emotion\x85', 52167: "'stargate'", 40892: 'pinoy', 40893: 'conchatta', 41131: 'broeke', 40894: 'advisedly', 17639: "barker's", 52169: 'descours', 775: 'lots', 9262: 'lotr', 9882: 'irs', 52170: 'lott', 40895: 'xvi', 34731: 'irk', 52171: 'irl', 6890: 'ira', 21916: 'belzer', 52172: 'irc', 27648: 'ire', 40896: 'requisites', 7696: 'discipline', 52964: 'lyoko', 11313: 'extend', 876: 'nature', 52173: "'dickie'", 40897: 'optimist', 30589: 'lapping', 3903: 'superficial', 52174: 'vestment', 2826: 'extent', 52175: 'tendons', 52176: "heller's", 52177: 'quagmires', 52178: 'miyako', 20604: 'moocow', 52179: "coles'", 40898: 'lookit', 52180: 'ravenously', 40899: 'levitating', 52181: 'perfunctorily', 30590: 
'lookin', 40901: "lot'", 52182: 'lookie', 34873: 'fearlessly', 52184: 'libyan', 40902: 'fondles', 35717: 'gopher', 40904: 'wearying', 52185: "nz's", 27649: 'minuses', 52186: 'puposelessly', 52187: 'shandling', 31271: 'decapitates', 11932: 'humming', 40905: "'nother", 21917: 'smackdown', 30591: 'underdone', 40906: 'frf', 52188: 'triviality', 25251: 'fro', 8780: 'bothers', 52189: "'kensington", 76: 'much', 34733: 'muco', 22618: 'wiseguy', 27651: "richie's", 40907: 'tonino', 52190: 'unleavened', 11590: 'fry', 40908: "'tv'", 40909: 'toning', 14364: 'obese', 30592: 'sensationalized', 40910: 'spiv', 6262: 'spit', 7367: 'arkin', 21918: 'charleton', 16826: 'jeon', 21919: 'boardroom', 4992: 'doubts', 3087: 'spin', 53086: 'hepo', 27652: 'wildcat', 10587: 'venoms', 52194: 'misconstrues', 18517: 'mesmerising', 40911: 'misconstrued', 52195: 'rescinds', 52196: 'prostrate', 40912: 'majid', 16482: 'climbed', 34734: 'canoeing', 52198: 'majin', 57807: 'animie', 40913: 'sylke', 14902: 'conditioned', 40914: 'waddell', 52199: '3\x85', 41191: 'hyperdrive', 34735: 'conditioner', 53156: 'bricklayer', 2579: 'hong', 52201: 'memoriam', 30595: 'inventively', 25252: "levant's", 20641: 'portobello', 52203: 'remand', 19507: 'mummified', 27653: 'honk', 19508: 'spews', 40915: 'visitations', 52204: 'mummifies', 25253: 'cavanaugh', 23388: 'zeon', 40916: "jungle's", 34736: 'viertel', 27654: 'frenchmen', 52205: 'torpedoes', 52206: 'schlessinger', 34737: 'torpedoed', 69879: 'blister', 52207: 'cinefest', 34738: 'furlough', 52208: 'mainsequence', 40917: 'mentors', 9097: 'academic', 20605: 'stillness', 40918: 'academia', 52209: 'lonelier', 52210: 'nibby', 52211: "losers'", 40919: 'cineastes', 4452: 'corporate', 40920: 'massaging', 30596: 'bellow', 19509: 'absurdities', 53244: 'expetations', 40921: 'nyfiken', 75641: 'mehras', 52212: 'lasse', 52213: 'visability', 33949: 'militarily', 52214: "elder'", 19026: 'gainsbourg', 20606: 'hah', 13423: 'hai', 34739: 'haj', 25254: 'hak', 4314: 'hal', 4895: 'ham', 
53262: 'duffer', 52216: 'haa', 69: 'had', 11933: 'advancement', 16828: 'hag', 25255: "hand'", 13424: 'hay', 20607: 'mcnamara', 52217: "mozart's", 30734: 'duffel', 30597: 'haq', 13890: 'har', 47: 'has', 2404: 'hat', 40922: 'hav', 30598: 'haw', 52218: 'figtings', 15498: 'elders', 52219: 'underpanted', 52220: 'pninson', 27655: 'unequivocally', 23676: "barbara's", 52222: "bello'", 13000: 'indicative', 40923: 'yawnfest', 52223: 'hexploitation', 52224: "loder's", 27656: 'sleuthing', 32625: "justin's", 52225: "'ball", 52226: "'summer", 34938: "'demons'", 52228: "mormon's", 34740: "laughton's", 52229: 'debell', 39727: 'shipyard', 30600: 'unabashedly', 40404: 'disks', 2293: 'crowd', 10090: 'crowe', 56437: "vancouver's", 34741: 'mosques', 6630: 'crown', 52230: 'culpas', 27657: 'crows', 53347: 'surrell', 52232: 'flowless', 52233: 'sheirk', 40926: "'three", 52234: "peterson'", 52235: 'ooverall', 40927: 'perchance', 1324: 'bottom', 53366: 'chabert', 52236: 'sneha', 13891: 'inhuman', 52237: 'ichii', 52238: 'ursla', 30601: 'completly', 40928: 'moviedom', 52239: 'raddick', 51998: 'brundage', 40929: 'brigades', 1184: 'starring', 52240: "'goal'", 52241: 'caskets', 52242: 'willcock', 52243: "threesome's", 52244: "mosque'", 52245: "cover's", 17640: 'spaceships', 40930: 'anomalous', 27658: 'ptsd', 52246: 'shirdan', 21965: 'obscenity', 30602: 'lemmings', 30603: 'duccio', 52247: "levene's", 52248: "'gorby'", 25258: "teenager's", 5343: 'marshall', 9098: 'honeymoon', 3234: 'shoots', 12261: 'despised', 52249: 'okabasho', 8292: 'fabric', 18518: 'cannavale', 3540: 'raped', 52250: "tutt's", 17641: 'grasping', 18519: 'despises', 40931: "thief's", 8929: 'rapes', 52251: 'raper', 27659: "eyre'", 52252: 'walchek', 23389: "elmo's", 40932: 'perfumes', 21921: 'spurting', 52253: "exposition'\x85", 52254: 'denoting', 34743: 'thesaurus', 40933: "shoot'", 49762: 'bonejack', 52256: 'simpsonian', 30604: 'hebetude', 34744: "hallow's", 52257: 'desperation\x85', 34745: 'incinerator', 10311: 'congratulations', 
52258: 'humbled', 5927: "else's", 40848: 'trelkovski', 52259: "rape'", 59389: "'chapters'", 52260: '1600s', 7256: 'martian', 25259: 'nicest', 52262: 'eyred', 9460: 'passenger', 6044: 'disgrace', 52263: 'moderne', 5123: 'barrymore', 52264: 'yankovich', 40934: 'moderns', 52265: 'studliest', 52266: 'bedsheet', 14903: 'decapitation', 52267: 'slurring', 52268: "'nunsploitation'", 34746: "'character'", 9883: 'cambodia', 52269: 'rebelious', 27660: 'pasadena', 40935: 'crowne', 52270: "'bedchamber", 52271: 'conjectural', 52272: 'appologize', 52273: 'halfassing', 57819: 'paycheque', 20609: 'palms', 52274: "'islands", 40936: 'hawked', 21922: 'palme', 40937: 'conservatively', 64010: 'larp', 5561: 'palma', 21923: 'smelling', 13001: 'aragorn', 52275: 'hawker', 52276: 'hawkes', 3978: 'explosions', 8062: 'loren', 52277: "pyle's", 6707: 'shootout', 18520: "mike's", 52278: "driscoll's", 40938: 'cogsworth', 52279: "britian's", 34747: 'childs', 52280: "portrait's", 3629: 'chain', 2500: 'whoever', 52281: 'puttered', 52282: 'childe', 52283: 'maywether', 3039: 'chair', 52284: "rance's", 34748: 'machu', 4520: 'ballet', 34749: 'grapples', 76155: 'summerize', 30606: 'freelance', 52286: "andrea's", 52287: '\x91very', 45882: 'coolidge', 18521: 'mache', 52288: 'balled', 40940: 'grappled', 18522: 'macha', 21924: 'underlining', 5626: 'macho', 19510: 'oversight', 25260: 'machi', 11314: 'verbally', 21925: 'tenacious', 40941: 'windshields', 18560: 'paychecks', 3399: 'jerk', 11934: "good'", 34751: 'prancer', 21926: 'prances', 52289: 'olympus', 21927: 'lark', 10788: 'embark', 7368: 'gloomy', 52290: 'jehaan', 52291: 'turaqui', 20610: "child'", 2897: 'locked', 52292: 'pranced', 2591: 'exact', 52293: 'unattuned', 786: 'minute', 16121: 'skewed', 40943: 'hodgins', 34752: 'skewer', 52294: 'think\x85', 38768: 'rosenstein', 52295: 'helmit', 34753: 'wrestlemanias', 16829: 'hindered', 30607: "martha's", 52296: 'cheree', 52297: "pluckin'", 40944: 'ogles', 11935: 'heavyweight', 82193: 'aada', 11315: 'chopping', 
61537: 'strongboy', 41345: 'hegemonic', 40945: 'adorns', 41349: 'xxth', 34754: 'nobuhiro', 52301: 'capitães', 52302: 'kavogianni', 13425: 'antwerp', 6541: 'celebrated', 52303: 'roarke', 40946: 'baggins', 31273: 'cheeseburgers', 52304: 'matras', 52305: "nineties'", 52306: "'craig'", 13002: 'celebrates', 3386: 'unintentionally', 14365: 'drafted', 52307: 'climby', 52308: '303', 18523: 'oldies', 9099: 'climbs', 9658: 'honour', 34755: 'plucking', 30077: '305', 5517: 'address', 40947: 'menjou', 42595: "'freak'", 19511: 'dwindling', 9461: 'benson', 52310: 'white’s', 40948: 'shamelessness', 21928: 'impacted', 52311: 'upatz', 3843: 'cusack', 37570: "flavia's", 52312: 'effette', 34756: 'influx', 52313: 'boooooooo', 52314: 'dimitrova', 13426: 'houseman', 25262: 'bigas', 52315: 'boylen', 52316: 'phillipenes', 40949: 'fakery', 27661: "grandpa's", 27662: 'darnell', 19512: 'undergone', 52318: 'handbags', 21929: 'perished', 37781: 'pooped', 27663: 'vigour', 3630: 'opposed', 52319: 'etude', 11802: "caine's", 52320: 'doozers', 34757: 'photojournals', 52321: 'perishes', 34758: 'constrains', 40951: 'migenes', 30608: 'consoled', 16830: 'alastair', 52322: 'wvs', 52323: 'ooooooh', 34759: 'approving', 40952: 'consoles', 52067: 'disparagement', 52325: 'futureistic', 52326: 'rebounding', 52327: "'date", 52328: 'gregoire', 21930: 'rutherford', 34760: 'americanised', 82199: 'novikov', 1045: 'following', 34761: 'munroe', 52329: "morita'", 52330: 'christenssen', 23109: 'oatmeal', 25263: 'fossey', 40953: 'livered', 13003: 'listens', 76167: "'marci", 52333: "otis's", 23390: 'thanking', 16022: 'maude', 34762: 'extensions', 52335: 'ameteurish', 52336: "commender's", 27664: 'agricultural', 4521: 'convincingly', 17642: 'fueled', 54017: 'mahattan', 40955: "paris's", 52339: 'vulkan', 52340: 'stapes', 52341: 'odysessy', 12262: 'harmon', 4255: 'surfing', 23497: 'halloran', 49583: 'unbelieveably', 52342: "'offed'", 30610: 'quadrant', 19513: 'inhabiting', 34763: 'nebbish', 40956: 'forebears', 34764: 
'skirmish', 52343: 'ocassionally', 52344: "'resist", 21931: 'impactful', 52345: 'spicier', 40957: 'touristy', 52346: "'football'", 40958: 'webpage', 52348: 'exurbia', 52349: 'jucier', 14904: 'professors', 34765: 'structuring', 30611: 'jig', 40959: 'overlord', 25264: 'disconnect', 82204: 'sniffle', 40960: 'slimeball', 40961: 'jia', 16831: 'milked', 40962: 'banjoes', 1240: 'jim', 52351: 'workforces', 52352: 'jip', 52353: 'rotweiller', 34766: 'mundaneness', 52354: "'ninja'", 11043: "dead'", 40963: "cipriani's", 20611: 'modestly', 52355: "professor'", 40964: 'shacked', 34767: 'bashful', 23391: 'sorter', 16123: 'overpowering', 18524: 'workmanlike', 27665: 'henpecked', 18525: 'sorted', 52357: "jōb's", 52358: "'always", 34768: "'baptists", 52359: 'dreamcatchers', 52360: "'silence'", 21932: 'hickory', 52361: 'fun\x97yet', 52362: 'breakumentary', 15499: 'didn', 52363: 'didi', 52364: 'pealing', 40965: 'dispite', 25265: "italy's", 21933: 'instability', 6542: 'quarter', 12611: 'quartet', 52365: 'padmé', 52366: "'bleedmedry", 52367: 'pahalniuk', 52368: 'honduras', 10789: 'bursting', 41468: "pablo's", 52370: 'irremediably', 40966: 'presages', 57835: 'bowlegged', 65186: 'dalip', 6263: 'entering', 76175: 'newsradio', 54153: 'presaged', 27666: "giallo's", 40967: 'bouyant', 52371: 'amerterish', 18526: 'rajni', 30613: 'leeves', 34770: 'macauley', 615: 'seriously', 52372: 'sugercoma', 52373: 'grimstead', 52374: "'fairy'", 30614: 'zenda', 52375: "'twins'", 17643: 'realisation', 27667: 'highsmith', 7820: 'raunchy', 40968: 'incentives', 52377: 'flatson', 35100: 'snooker', 16832: 'crazies', 14905: 'crazier', 7097: 'grandma', 52378: 'napunsaktha', 30615: 'workmanship', 52379: 'reisner', 61309: "sanford's", 52380: '\x91doña', 6111: 'modest', 19156: "everything's", 40969: 'hamer', 52382: "couldn't'", 13004: 'quibble', 52383: 'socking', 21934: 'tingler', 52384: 'gutman', 40970: 'lachlan', 52385: 'tableaus', 52386: 'headbanger', 2850: 'spoken', 34771: 'cerebrally', 23493: "'road", 21935: 
'tableaux', 40971: "proust's", 40972: 'periodical', 52388: "shoveller's", 25266: 'tamara', 17644: 'affords', 3252: 'concert', 87958: "yara's", 52389: 'someome', 8427: 'lingering', 41514: "abraham's", 34772: 'beesley', 34773: 'cherbourg', 28627: 'kagan', 9100: 'snatch', 9263: "miyazaki's", 25267: 'absorbs', 40973: "koltai's", 64030: 'tingled', 19514: 'crossroads', 16124: 'rehab', 52392: 'falworth', 52393: 'sequals', ...}
In [15]:
# Peek at the first training review in its encoded form (a sequence of integer
# word indices as returned by `imdb.load_data`).
X_train[0]
Out[15]:
[2, 2, 2, 2, 2, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 2, 173, 2, 256, 2, 2, 100, 2, 838, 112, 50, 670, 2, 2, 2, 480, 284, 2, 150, 2, 172, 112, 167, 2, 336, 385, 2, 2, 172, 4536, 1111, 2, 546, 2, 2, 447, 2, 192, 50, 2, 2, 147, 2025, 2, 2, 2, 2, 1920, 4613, 469, 2, 2, 71, 87, 2, 2, 2, 530, 2, 76, 2, 2, 1247, 2, 2, 2, 515, 2, 2, 2, 626, 2, 2, 2, 62, 386, 2, 2, 316, 2, 106, 2, 2, 2223, 2, 2, 480, 66, 3785, 2, 2, 130, 2, 2, 2, 619, 2, 2, 124, 51, 2, 135, 2, 2, 1415, 2, 2, 2, 2, 215, 2, 77, 52, 2, 2, 407, 2, 82, 2, 2, 2, 107, 117, 2, 2, 256, 2, 2, 2, 3766, 2, 723, 2, 71, 2, 530, 476, 2, 400, 317, 2, 2, 2, 2, 1029, 2, 104, 88, 2, 381, 2, 297, 98, 2, 2071, 56, 2, 141, 2, 194, 2, 2, 2, 226, 2, 2, 134, 476, 2, 480, 2, 144, 2, 2, 2, 51, 2, 2, 224, 92, 2, 104, 2, 226, 65, 2, 2, 1334, 88, 2, 2, 283, 2, 2, 4472, 113, 103, 2, 2, 2, 2, 2, 178, 2]
In [16]:
# Translate the first encoded training review back into space-separated words.
first_review_words = (index_word[token_id] for token_id in X_train[0])
" ".join(first_review_words)
Out[16]:
"UNK UNK UNK UNK UNK brilliant casting location scenery story direction everyone's really suited UNK part UNK played UNK UNK could UNK imagine being there robert UNK UNK UNK amazing actor UNK now UNK same being director UNK father came UNK UNK same scottish island UNK myself UNK UNK loved UNK fact there UNK UNK real connection UNK UNK UNK UNK witty remarks throughout UNK UNK were great UNK UNK UNK brilliant UNK much UNK UNK bought UNK UNK UNK soon UNK UNK UNK released UNK UNK UNK would recommend UNK UNK everyone UNK watch UNK UNK fly UNK UNK amazing really cried UNK UNK end UNK UNK UNK sad UNK UNK know what UNK say UNK UNK cry UNK UNK UNK UNK must UNK been good UNK UNK definitely UNK also UNK UNK UNK two little UNK UNK played UNK UNK UNK norman UNK paul UNK were UNK brilliant children UNK often left UNK UNK UNK UNK list UNK think because UNK stars UNK play them UNK grown up UNK such UNK big UNK UNK UNK whole UNK UNK these children UNK amazing UNK should UNK UNK UNK what UNK UNK done don't UNK think UNK whole story UNK UNK lovely because UNK UNK true UNK UNK someone's life after UNK UNK UNK UNK UNK us UNK"
In [17]:
# Reload the dataset with default arguments (no `num_words` / `skip_top`
# restriction) so the first review can be read in full for comparison.
# Only the training sequences are needed, so index into the result instead of
# unpacking into `_`: assigning to `_` in IPython overrides its "last output"
# variable and stops IPython from updating it afterwards.
textual_X_train = imdb.load_data()[0][0]
" ".join(index_word[token_id] for token_id in textual_X_train[0])
Out[17]:
"START this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert redford's is an amazing actor and now the same being director norman's father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for retail and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also congratulations to the two little boy's that played the part's of norman and paul they were just brilliant children are often left out of the praising list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you think the whole story was so lovely because it was true and was someone's life after all that was shared with us all"
In [18]:
from keras.api.preprocessing.sequence import pad_sequences
# Bring every review to exactly `max_length` tokens. Both padding and truncation
# are applied at the front ("pre"), and 0 is used as the padding value.
padding_options = dict(maxlen=max_length, padding="pre", truncating="pre", value=0)
X_train = pad_sequences(X_train, **padding_options)
X_valid = pad_sequences(X_valid, **padding_options)
In [19]:
from keras.api.models import Sequential
from keras.api.layers import Dense, Flatten, Dropout, Embedding, InputLayer
# Baseline dense sentiment classifier:
#   token ids -> 64-d embeddings -> flatten -> Dense(64, relu) -> Dropout -> sigmoid
simple_model = Sequential(
    [
        # The inputs are integer token ids, so declare an integer dtype. The
        # previous "float32" made the ids round-trip through floats before the
        # embedding lookup, which is wasteful and loses exactness for large ids.
        InputLayer(shape=(max_length,), dtype="int32"),
        # One 64-dimensional vector per vocabulary entry.
        Embedding(unique_words, 64),
        # Concatenate the per-token vectors into one flat feature vector.
        Flatten(),
        Dense(64, activation="relu"),
        # Regularisation: drop half of the hidden activations during training.
        Dropout(0.5),
        # Single sigmoid unit for the binary label.
        Dense(1, activation="sigmoid"),
    ]
)
simple_model.summary()
Model: "sequential_1"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ embedding_1 (Embedding) │ (None, 100, 64) │ 320,000 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ flatten_1 (Flatten) │ (None, 6400) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_2 (Dense) │ (None, 64) │ 409,664 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dropout_1 (Dropout) │ (None, 64) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_3 (Dense) │ (None, 1) │ 65 │ └─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 729,729 (2.78 MB)
Trainable params: 729,729 (2.78 MB)
Non-trainable params: 0 (0.00 B)
In [20]:
from keras.api.callbacks import ModelCheckpoint
# Write one checkpoint file per epoch; `{epoch:02d}` is filled in by the callback.
checkpoint_path = output_dir + "/simple_weights.{epoch:02d}.keras"
epoch_checkpoint = ModelCheckpoint(filepath=checkpoint_path)

# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric.
simple_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train for 4 epochs, reporting validation metrics after each one.
simple_model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_valid, y_valid),
    epochs=4,
    batch_size=128,
    callbacks=[epoch_checkpoint],
)
Epoch 1/4 196/196 ━━━━━━━━━━━━━━━━━━━━ 4s 19ms/step - accuracy: 0.5753 - loss: 0.6517 - val_accuracy: 0.8346 - val_loss: 0.3689 Epoch 2/4 196/196 ━━━━━━━━━━━━━━━━━━━━ 3s 17ms/step - accuracy: 0.8922 - loss: 0.2751 - val_accuracy: 0.8460 - val_loss: 0.3510 Epoch 3/4 196/196 ━━━━━━━━━━━━━━━━━━━━ 4s 18ms/step - accuracy: 0.9724 - loss: 0.1080 - val_accuracy: 0.8335 - val_loss: 0.4402 Epoch 4/4 196/196 ━━━━━━━━━━━━━━━━━━━━ 3s 18ms/step - accuracy: 0.9974 - loss: 0.0224 - val_accuracy: 0.8337 - val_loss: 0.5407
Out[20]:
<keras.src.callbacks.history.History at 0x365389a90>
In [21]:
# Restore the checkpoint written after epoch 2 (the epoch with the lowest
# val_loss in the recorded training log) and score it on the validation set.
best_epoch_checkpoint = output_dir + "/simple_weights.02.keras"
simple_model.load_weights(best_epoch_checkpoint)
simple_model.evaluate(X_valid, y_valid)
782/782 ━━━━━━━━━━━━━━━━━━━━ 4s 5ms/step - accuracy: 0.8436 - loss: 0.3559
Out[21]:
[0.3510318398475647, 0.8459600210189819]
In [22]:
import matplotlib.pyplot as plt
# Histogram of the predicted probabilities on the validation set, with a
# vertical marker at the 0.5 decision threshold.
plt.hist(simple_model.predict(X_valid))
# The trailing semicolon suppresses the returned artist's repr without
# assigning to `_`, which would override IPython's "last output" variable.
plt.axvline(x=0.5, color="orange");
782/782 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step