👨🏻‍🏫IT 활동/인공지능교육 - NLP

[NLP] Day 11 - Preprocessing 2

728x90
반응형

Preprocessing 2

In [1]:
import nltk
In [3]:
from nltk.corpus import gutenberg

# raw() reads the file and closes it, unlike open(...).read(), which leaves the
# stream open until it is garbage-collected.
corpus = gutenberg.raw(gutenberg.fileids()[0])
In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Corpus size at three granularities: raw lines, sentences, and word tokens.
len(corpus.splitlines()), len(sent_tokenize(corpus)), len(word_tokenize(corpus))
Out[4]:
(16823, 7493, 191785)
In [5]:
from nltk import Text

# Text wraps a list of tokens so the corpus can be explored through one object
# (length, vocabulary, counts, concordance, plots).
tokens = word_tokenize(corpus)
t = Text(tokens)
In [6]:
# Inspect the class of the Text instance. The original cell ended in a dangling
# "t." (an unfinished attribute access), which is a syntax error.
type(t)
Out[6]:
nltk.text.Text
In [8]:
# t holds 191785 tokens in total (punctuation tokens included).
len(t)
Out[8]:
191785
In [9]:
# Many tokens repeat, so count the distinct ones with a set (8406 here).
len(set(t))
Out[9]:
8406
In [12]:
# vocab() returns a FreqDist mapping each token to its number of occurrences.
print(t.vocab())
# most_common() lists (token, count) pairs from most to least frequent;
# punctuation marks dominate the top of the list.
t.vocab().most_common()
<FreqDist with 8406 samples and 191785 outcomes>
Out[12]:
[(',', 12016),
 ('.', 6355),
 ('to', 5125),
 ('the', 4844),
 ('and', 4653),
 ('of', 4272),
 ('I', 3177),
 ('--', 3100),
 ('a', 3001),
 ("''", 2452),
 ('was', 2383),
 ('her', 2360),
 (';', 2353),
 ('not', 2242),
 ('in', 2103),
 ('it', 2103),
 ('be', 1965),
 ('she', 1774),
 ('``', 1735),
 ('that', 1729),
 ('you', 1664),
 ('had', 1605),
 ('as', 1387),
 ('he', 1365),
 ('for', 1320),
 ('have', 1301),
 ('is', 1221),
 ('with', 1185),
 ('very', 1151),
 ('but', 1148),
 ('Mr.', 1091),
 ('his', 1084),
 ('!', 1063),
 ('at', 996),
 ('so', 918),
 ("'s", 866),
 ('Emma', 855),
 ('all', 831),
 ('could', 824),
 ('would', 813),
 ('been', 755),
 ('him', 748),
 ('on', 674),
 ('Mrs.', 668),
 ('any', 651),
 ('?', 621),
 ('my', 619),
 ('no', 616),
 ('Miss', 592),
 ('were', 590),
 ('do', 583),
 ('must', 563),
 ('She', 562),
 ('by', 556),
 ('me', 554),
 ('which', 552),
 ('will', 545),
 ('from', 535),
 ('Harriet', 496),
 ('or', 490),
 ('said', 483),
 ('much', 476),
 ('more', 463),
 ('an', 451),
 ('are', 447),
 ('He', 441),
 ('such', 440),
 ('what', 434),
 ('Weston', 429),
 ('them', 429),
 ('there', 419),
 ('this', 418),
 ('than', 415),
 ('am', 410),
 ('one', 408),
 ('can', 407),
 ('It', 400),
 ('every', 398),
 ('thing', 394),
 ('they', 392),
 ('think', 380),
 ('Elton', 378),
 ('if', 375),
 ('Knightley', 373),
 ('should', 366),
 ('The', 357),
 ('being', 356),
 ('little', 354),
 ('never', 347),
 ('your', 337),
 ('did', 335),
 ('know', 335),
 ('only', 326),
 ('might', 321),
 ('when', 312),
 ('Woodhouse', 308),
 ('say', 308),
 ('You', 303),
 ('Jane', 301),
 ('own', 300),
 ('their', 297),
 ('good', 297),
 ('But', 293),
 ('well', 292),
 ('who', 281),
 ('herself', 273),
 ('quite', 269),
 ('now', 268),
 ('time', 268),
 ('how', 263),
 ('great', 262),
 ('we', 260),
 ('too', 253),
 ('some', 247),
 ('about', 246),
 ('most', 243),
 ('has', 243),
 ('before', 239),
 ('always', 235),
 ('nothing', 233),
 ('Fairfax', 232),
 ('man', 230),
 ('thought', 225),
 ('And', 224),
 ('soon', 220),
 ('see', 220),
 ('other', 219),
 ('dear', 217),
 ('again', 216),
 ('may', 213),
 ('Churchill', 213),
 ('shall', 212),
 ('without', 211),
 ('Frank', 207),
 ('first', 205),
 ('out', 205),
 ('sure', 201),
 ('father', 199),
 ('made', 199),
 ('like', 196),
 ('body', 192),
 ('young', 190),
 ('ever', 189),
 ('Oh', 185),
 ('up', 184),
 ('friend', 175),
 ('indeed', 175),
 (':', 174),
 ('two', 171),
 ('though', 169),
 ('into', 163),
 ('better', 163),
 ('Hartfield', 160),
 ('just', 159),
 ('day', 158),
 ('come', 158),
 ('give', 157),
 ('really', 153),
 ('way', 152),
 ('make', 151),
 ('then', 150),
 ('They', 148),
 ('having', 145),
 ('rather', 145),
 ('Bates', 145),
 ('himself', 144),
 ('us', 143),
 ('long', 142),
 ('hope', 142),
 ('seemed', 141),
 ('after', 140),
 ('done', 140),
 ('away', 137),
 ('many', 133),
 ('wish', 133),
 ('upon', 133),
 ('over', 130),
 ('There', 129),
 ('home', 128),
 ('enough', 128),
 ('woman', 127),
 ('here', 127),
 ('go', 127),
 ('mind', 126),
 ('No', 125),
 ('Highbury', 124),
 ('A', 123),
 ("'", 123),
 ('does', 123),
 ('even', 122),
 ('happy', 120),
 ('heard', 120),
 ('came', 119),
 ('last', 119),
 ('its', 118),
 ('going', 118),
 ('moment', 118),
 ('love', 117),
 ('take', 116),
 ('look', 114),
 ('however', 113),
 ('pleasure', 113),
 ('while', 113),
 ('`', 112),
 ('felt', 111),
 ('sort', 111),
 ('If', 110),
 ('still', 109),
 ('How', 108),
 ('This', 108),
 ('My', 108),
 ('saw', 108),
 ('morning', 107),
 ('(', 107),
 (')', 107),
 ('letter', 107),
 ('yet', 106),
 ('few', 106),
 ('poor', 106),
 ('something', 105),
 ('another', 104),
 ('What', 102),
 ('same', 102),
 ('believe', 101),
 ('feelings', 101),
 ('idea', 99),
 ('myself', 99),
 ('hear', 99),
 ('speak', 98),
 ('half', 98),
 ('off', 98),
 ('doubt', 98),
 ('till', 95),
 ('feel', 95),
 ('subject', 95),
 ('evening', 93),
 ('ought', 93),
 ('party', 93),
 ('house', 92),
 ('people', 92),
 ('deal', 92),
 ('found', 92),
 ('word', 92),
 ('looked', 91),
 ('often', 90),
 ('certainly', 90),
 ('want', 89),
 ('We', 89),
 ('our', 89),
 ('John', 89),
 ('Her', 88),
 ('Randalls', 88),
 ('right', 88),
 ('knew', 88),
 ('Smith', 88),
 ('place', 87),
 ('present', 87),
 ('almost', 87),
 ('coming', 87),
 ('those', 87),
 ('Yes', 87),
 ('Well', 86),
 ('visit', 85),
 ('Martin', 85),
 ('possible', 84),
 ('least', 84),
 ('best', 83),
 ('both', 83),
 ('old', 83),
 ('room', 83),
 ('friends', 82),
 ('world', 80),
 ('together', 80),
 ('suppose', 80),
 ('perhaps', 80),
 ('else', 79),
 ('cried', 79),
 ('once', 79),
 ('Perry', 79),
 ('gone', 78),
 ('told', 78),
 ('replied', 78),
 ('given', 77),
 ('family', 76),
 ('hour', 76),
 ('life', 76),
 ('kind', 76),
 ('obliged', 76),
 ('That', 76),
 ('happiness', 75),
 ('manner', 75),
 ('find', 74),
 ('wanted', 74),
 ('tell', 73),
 ('seen', 73),
 ('whole', 73),
 ('hardly', 71),
 ('whom', 71),
 ('between', 71),
 ('able', 71),
 ('where', 71),
 ('lady', 71),
 ('far', 70),
 ('In', 70),
 ('down', 70),
 ('Isabella', 69),
 ('opinion', 69),
 ('side', 69),
 ('Do', 69),
 ('mother', 68),
 ('less', 68),
 ('left', 68),
 ('immediately', 68),
 ('short', 67),
 ('wife', 67),
 ('get', 67),
 ('understand', 67),
 ('person', 67),
 ('talked', 66),
 ('heart', 65),
 ('carriage', 65),
 ('afraid', 65),
 ('back', 65),
 ('ready', 65),
 ('towards', 65),
 ('extremely', 65),
 ('situation', 64),
 ('spirits', 64),
 ('part', 64),
 ('comfort', 64),
 ('let', 64),
 ('perfectly', 64),
 ('began', 64),
 ('general', 64),
 ('answer', 64),
 ('next', 63),
 ('acquaintance', 63),
 ('nor', 63),
 ('pretty', 63),
 ('marry', 63),
 ('since', 63),
 ('set', 63),
 ('three', 62),
 ('sir', 62),
 ('therefore', 62),
 ('looking', 62),
 ('Mr', 62),
 ('through', 61),
 ('change', 61),
 ('put', 61),
 ('whether', 61),
 ('either', 61),
 ('course', 61),
 ('under', 61),
 ('things', 60),
 ('sorry', 60),
 ('talk', 60),
 ('thinking', 60),
 ('Cole', 60),
 ('daughter', 59),
 ('others', 59),
 ('attention', 59),
 ('dare', 59),
 ('assure', 59),
 ('true', 58),
 ('longer', 58),
 ('taken', 58),
 ('His', 57),
 ('mean', 57),
 ('superior', 57),
 ('years', 56),
 ('equal', 56),
 ('children', 56),
 ('feeling', 56),
 ('known', 56),
 ('return', 56),
 ('sense', 56),
 ('To', 56),
 ('state', 56),
 ('CHAPTER', 55),
 ('account', 55),
 ('aunt', 55),
 ('bad', 55),
 ('passed', 54),
 ('brought', 54),
 ('name', 54),
 ('cold', 54),
 ('business', 54),
 ('yourself', 54),
 ('degree', 54),
 ('gave', 54),
 ('nobody', 53),
 ('because', 53),
 ('imagine', 53),
 ('Goddard', 53),
 ('minutes', 53),
 ('manners', 52),
 ('beyond', 52),
 ('walk', 52),
 ('walked', 52),
 ('leave', 52),
 ('talking', 52),
 ('hand', 52),
 ('bear', 51),
 ('among', 51),
 ('exactly', 51),
 ('brother', 51),
 ('When', 51),
 ('saying', 51),
 ('seeing', 51),
 ('point', 51),
 ('Very', 51),
 ('usual', 50),
 ('natural', 50),
 ('directly', 50),
 ('agreeable', 50),
 ('making', 50),
 ('care', 50),
 ('eyes', 50),
 ('read', 50),
 ('So', 50),
 ('Campbell', 50),
 ('affection', 49),
 ('used', 49),
 ('rest', 49),
 ('glad', 49),
 ('days', 49),
 ('satisfied', 49),
 ('Donwell', 49),
 ('wonder', 49),
 ('As', 49),
 ('Such', 49),
 ('words', 49),
 ('these', 48),
 ('means', 48),
 ('attachment', 48),
 ('appeared', 48),
 ('Taylor', 47),
 ('wished', 47),
 ('society', 47),
 ('allow', 47),
 ('took', 47),
 ('voice', 47),
 ('particularly', 46),
 ('each', 46),
 ('girl', 46),
 ('went', 46),
 ('Ah', 46),
 ('believed', 46),
 ('giving', 46),
 ('against', 46),
 ('pleased', 46),
 ('interest', 46),
 ('speaking', 46),
 ('near', 46),
 ('London', 45),
 ('walking', 45),
 ('likely', 45),
 ('air', 45),
 ('remember', 45),
 ('doing', 44),
 ('married', 44),
 ('already', 44),
 ('door', 44),
 ('full', 44),
 ('strong', 44),
 ('called', 44),
 ('smile', 44),
 ('hoped', 43),
 ('regard', 43),
 ('meant', 43),
 ('ask', 43),
 ('stay', 43),
 ('farther', 43),
 ('character', 42),
 ('conversation', 42),
 ('need', 42),
 ('son', 42),
 ('proper', 42),
 ('ladies', 42),
 ('spoke', 42),
 ('real', 41),
 ('dinner', 41),
 ('afterwards', 41),
 ('call', 41),
 ('fine', 41),
 ('different', 41),
 ('respect', 41),
 ('Dixon', 41),
 ('early', 40),
 ('allowed', 40),
 ('power', 40),
 ('beginning', 40),
 ('ill', 40),
 ('particular', 40),
 ('end', 40),
 ('reason', 40),
 ('fond', 39),
 ('settled', 39),
 ('entirely', 39),
 ('meeting', 39),
 ('night', 39),
 ('help', 39),
 ('Not', 39),
 ('question', 39),
 ('creature', 39),
 ('whose', 39),
 ('delightful', 39),
 ('round', 39),
 ('expected', 39),
 ('object', 39),
 ('One', 39),
 ('weather', 39),
 ('sit', 38),
 ('pleasant', 38),
 ('kindness', 38),
 ('returned', 38),
 ('met', 38),
 ('bring', 38),
 ('child', 38),
 ('wrong', 38),
 ('asked', 38),
 ('use', 38),
 ('continued', 38),
 ('letters', 38),
 ('Colonel', 38),
 ('handsome', 37),
 ('sat', 37),
 ('five', 37),
 ('impossible', 37),
 ('thoughts', 37),
 ('head', 37),
 ('sometimes', 37),
 ('Every', 37),
 ('themselves', 37),
 ('convinced', 37),
 ('fancy', 37),
 ('truth', 37),
 ('yes', 37),
 ('seem', 37),
 ('added', 37),
 ('open', 37),
 ('sitting', 37),
 ('fortune', 36),
 ('necessary', 36),
 ('got', 36),
 ('received', 36),
 ('Enscombe', 36),
 ('new', 36),
 ('Now', 36),
 ('surprize', 36),
 ('determined', 36),
 ('probably', 36),
 ('consider', 36),
 ('appear', 36),
 ('temper', 35),
 ('please', 35),
 ('engaged', 35),
 ('supposed', 35),
 ('forward', 35),
 ('nature', 35),
 ('news', 35),
 ('comfortable', 34),
 ('marriage', 34),
 ('health', 34),
 ('amiable', 34),
 ('husband', 34),
 ('circumstance', 34),
 ('At', 34),
 ('common', 34),
 ('company', 34),
 ('taste', 34),
 ('excellent', 33),
 ('large', 33),
 ('perfect', 33),
 ('why', 33),
 ('sent', 33),
 ('acquainted', 33),
 ('wanting', 33),
 ('sister', 32),
 ('meet', 32),
 ('evil', 32),
 ('keep', 32),
 ('hearing', 32),
 ('face', 32),
 ('neither', 32),
 ('meaning', 32),
 ('obliging', 32),
 ('Robert', 32),
 ('absolutely', 32),
 ('dancing', 32),
 ('ago', 31),
 ('pain', 31),
 ('surprized', 31),
 ('except', 31),
 ('resolution', 31),
 ('compliment', 31),
 ('says', 31),
 ('case', 31),
 ('Mrs', 31),
 ('gentleman', 31),
 ('smiling', 31),
 ('praise', 31),
 ('persuaded', 31),
 ('Maple', 31),
 ('Grove', 31),
 ('danger', 30),
 ('difference', 30),
 ('Poor', 30),
 ('begin', 30),
 ('knows', 30),
 ('pass', 30),
 ('matter', 30),
 ('With', 30),
 ('circumstances', 30),
 ('appearance', 30),
 ('interesting', 30),
 ('months', 30),
 ('serious', 30),
 ('difficulty', 30),
 ('spoken', 30),
 ('anxious', 30),
 ('worth', 30),
 ('gratitude', 29),
 ('followed', 29),
 ('papa', 29),
 ('turned', 29),
 ('greatest', 29),
 ('merely', 29),
 ('small', 29),
 ('greater', 29),
 ('second', 29),
 ('week', 29),
 ('beauty', 29),
 ('delighted', 29),
 ('fair', 29),
 ('men', 29),
 ('within', 29),
 ('charming', 29),
 ('ball', 29),
 ('judgment', 28),
 ('easy', 28),
 ('cheerful', 28),
 ('times', 28),
 ('looks', 28),
 ('shew', 28),
 ('warm', 28),
 ('eye', 28),
 ('ten', 28),
 ('turn', 28),
 ('attentions', 28),
 ('forget', 28),
 ('engagement', 28),
 ('clever', 27),
 ('satisfaction', 27),
 ('companion', 27),
 ('loved', 27),
 ('advantage', 27),
 ('none', 27),
 ('year', 27),
 ('Your', 27),
 ('view', 27),
 ('curiosity', 27),
 ('occasion', 27),
 ('happened', 27),
 ('style', 27),
 ('notice', 27),
 ('actually', 27),
 ('decided', 27),
 ('yesterday', 27),
 ('honour', 27),
 ('ashamed', 27),
 ('resolved', 27),
 ('consequence', 26),
 ('friendship', 26),
 ('match', 26),
 ('knowing', 26),
 ('aware', 26),
 ('marrying', 26),
 ('pity', 26),
 ('mentioned', 26),
 ('late', 26),
 ('alone', 26),
 ('spirit', 26),
 ('For', 26),
 ('fortunate', 26),
 ('regret', 26),
 ('tried', 26),
 ('knowledge', 26),
 ('Upon', 26),
 ('taking', 26),
 ('to-morrow', 26),
 ('completely', 26),
 ('Let', 26),
 ('certain', 26),
 ('lived', 25),
 ('attached', 25),
 ('sad', 25),
 ('odd', 25),
 ('pay', 25),
 ('distance', 25),
 ('sensible', 25),
 ('understanding', 25),
 ('to-day', 25),
 ('hours', 25),
 ('favour', 25),
 ('sake', 25),
 ('quiet', 25),
 ('sight', 25),
 ('led', 25),
 ('silent', 25),
 ('plan', 25),
 ('worse', 25),
 ('excuse', 25),
 ('conduct', 25),
 ('it.', 25),
 ('Campbells', 25),
 ('Crown', 25),
 ('disposition', 24),
 ('highly', 24),
 ('spite', 24),
 ('hurry', 24),
 ('comes', 24),
 ('guess', 24),
 ('especially', 24),
 ('influence', 24),
 ('generally', 24),
 ('vain', 24),
 ('Abbey', 24),
 ('struck', 24),
 ('admiration', 24),
 ('seems', 24),
 ('justice', 24),
 ('entered', 24),
 ('judge', 24),
 ('music', 24),
 ('write', 24),
 ('consideration', 24),
 ('thank', 24),
 ('tone', 24),
 ('instead', 24),
 ('turning', 24),
 ('behaviour', 24),
 ('instrument', 24),
 ('loss', 23),
 ('event', 23),
 ('scheme', 23),
 ('town', 23),
 ('tea', 23),
 ('horses', 23),
 ('beautiful', 23),
 ('fact', 23),
 ('Dear', 23),
 ('thinks', 23),
 ('hands', 23),
 ('chuse', 23),
 ('favourite', 23),
 ('due', 23),
 ('became', 23),
 ('understood', 23),
 ('written', 23),
 ('mine', 23),
 ('note', 23),
 ('purpose', 23),
 ('send', 23),
 ('gentlemen', 23),
 ('quarter', 23),
 ('extraordinary', 23),
 ('table', 23),
 ('join', 23),
 ('also', 23),
 ('trying', 23),
 ('age', 22),
 ('answered', 22),
 ('four', 22),
 ('proof', 22),
 ('pleasing', 22),
 ('altogether', 22),
 ('enjoyment', 22),
 ('elegant', 22),
 ('couple', 22),
 ('plain', 22),
 ('whenever', 22),
 ('telling', 22),
 ('blush', 22),
 ('quick', 22),
 ('kept', 22),
 ('Why', 22),
 ('mistaken', 22),
 ('Henry', 22),
 ('wait', 22),
 ('standing', 22),
 ('claims', 22),
 ('caught', 22),
 ('William', 22),
 ('pianoforte', 22),
 ('promise', 21),
 ('useful', 21),
 ('spent', 21),
 ('observed', 21),
 ('rain', 21),
 ('suspect', 21),
 ('uncle', 21),
 ('service', 21),
 ('weeks', 21),
 ('inclination', 21),
 ('parties', 21),
 ('scarcely', 21),
 ('After', 21),
 ('invitation', 21),
 ('summer', 21),
 ('country', 21),
 ('picture', 21),
 ('approbation', 21),
 ('information', 21),
 ('likeness', 21),
 ('try', 21),
 ('stand', 21),
 ('Here', 21),
 ('duty', 21),
 ('hopes', 21),
 ('Indeed', 21),
 ('secret', 21),
 ('Eltons', 21),
 ('niece', 21),
 ('her.', 21),
 ('dance', 21),
 ('intimacy', 20),
 ('disagreeable', 20),
 ('past', 20),
 ('hers', 20),
 ('fault', 20),
 ('disposed', 20),
 ('observe', 20),
 ('dearest', 20),
 ('respectable', 20),
 ('connexion', 20),
 ('sweet', 20),
 ('complete', 20),
 ('wishes', 20),
 ('reasonable', 20),
 ('lately', 20),
 ('deserve', 20),
 ('recommend', 20),
 ('expect', 20),
 ('imagined', 20),
 ('fixed', 20),
 ('attempt', 20),
 ('sudden', 20),
 ('delight', 20),
 ('Is', 20),
 ('agreed', 20),
 ('consciousness', 20),
 ('smallest', 20),
 ('safe', 20),
 ('silence', 20),
 ('While', 20),
 ('presently', 20),
 ('otherwise', 20),
 ('ease', 20),
 ('disappointed', 20),
 ('suspicion', 20),
 ('distress', 19),
 ('lost', 19),
 ('work', 19),
 ('sigh', 19),
 ('thoroughly', 19),
 ('James', 19),
 ('Nobody', 19),
 ('lose', 19),
 ('forgotten', 19),
 ('safely', 19),
 ('considered', 19),
 ('worst', 19),
 ('unhappy', 19),
 ('mention', 19),
 ('women', 19),
 ('encouragement', 19),
 ('fortnight', 19),
 ('chance', 19),
 ('makes', 19),
 ('anxiety', 19),
 ("n't", 19),
 ('Then', 19),
 ('eager', 19),
 ('wrote', 19),
 ('delay', 19),
 ('hint', 19),
 ('offer', 19),
 ('Nothing', 19),
 ('charade', 19),
 ('passing', 19),
 ('Have', 19),
 ('tired', 19),
 ('calling', 19),
 ('paid', 19),
 ('Bath', 19),
 ('Hawkins', 19),
 ('Hill', 19),
 ('period', 18),
 ('rational', 18),
 ('joy', 18),
 ('reflection', 18),
 ('important', 18),
 ('spend', 18),
 ('single', 18),
 ('worthy', 18),
 ('pride', 18),
 ('charge', 18),
 ('desirable', 18),
 ('friendly', 18),
 ('thrown', 18),
 ('Had', 18),
 ('high', 18),
 ('These', 18),
 ('close', 18),
 ('form', 18),
 ('busy', 18),
 ('credit', 18),
 ('_her_', 18),
 ('speech', 18),
 ('road', 18),
 ('opportunity', 18),
 ('Perhaps', 18),
 ('Can', 18),
 ('equally', 18),
 ('lines', 18),
 ('behind', 18),
 ('occurred', 18),
 ('finding', 18),
 ('severe', 18),
 ('move', 18),
 ('angry', 18),
 ('listen', 18),
 ('play', 18),
 ('Box', 18),
 ('living', 17),
 ('considering', 17),
 ('miles', 17),
 ('live', 17),
 ('possibly', 17),
 ('considerable', 17),
 ('circle', 17),
 ('concern', 17),
 ('difficulties', 17),
 ('prevent', 17),
 ('run', 17),
 ('history', 17),
 ('regular', 17),
 ('inferior', 17),
 ('exercise', 17),
 ('shewed', 17),
 ('amused', 17),
 ('However', 17),
 ('remained', 17),
 ('Did', 17),
 ('repeated', 17),
 ('low', 17),
 ('countenance', 17),
 ('gives', 17),
 ('confidence', 17),
 ('slight', 17),
 ('warmly', 17),
 ('growing', 17),
 ('effect', 17),
 ('returning', 17),
 ('disappointment', 17),
 ('simple', 17),
 ('views', 17),
 ('recollect', 17),
 ('promised', 17),
 ('ourselves', 17),
 ('joined', 17),
 ('Weymouth', 17),
 ('arrived', 17),
 ('wishing', 17),
 ('liked', 16),
 ('gentle', 16),
 ('ways', 16),
 ('suffering', 16),
 ('smiled', 16),
 ('ideas', 16),
 ('welcome', 16),
 ('fire', 16),
 ('proved', 16),
 ('whatever', 16),
 ('occupied', 16),
 ('Where', 16),
 ('depend', 16),
 ('laughing', 16),
 ('education', 16),
 ('command', 16),
 ('admired', 16),
 ('_she_', 16),
 ('cause', 16),
 ('dislike', 16),
 ('line', 16),
 ('mere', 16),
 ('arrangement', 16),
 ('waiting', 16),
 ...]
In [14]:
# Plot the 50 most frequent tokens.
t.plot(50)
In [15]:
# Korean: load the first document of KoNLPy's kolaw corpus.
from konlpy.corpus import kolaw

# Close the corpus file as soon as it has been read instead of leaking the handle.
with kolaw.open(kolaw.fileids()[0]) as kolaw_file:
    kcorpus = kolaw_file.read()

# NOTE: word_tokenize does not split Korean particles from their stems
# (tokens such as '국민은' and '②국가는' stay whole), so counts are per surface form.
kterms = word_tokenize(kcorpus)
kt = Text(kterms)
In [16]:
# Total number of tokens vs. number of distinct tokens in the Korean text.
len(kt), len(set(kt))
Out[16]:
(4640, 2023)
In [19]:
# (token, count) pairs for the Korean text, most frequent first.
kt.vocab().most_common()
Out[19]:
[('.', 357),
 (',', 101),
 ('수', 87),
 ('①', 75),
 ('또는', 70),
 ('의하여', 66),
 ('법률이', 57),
 ('있다', 57),
 ('한다', 56),
 ('정하는', 50),
 ('그', 44),
 ('때에는', 42),
 ('관한', 39),
 ('바에', 37),
 ('국민은', 35),
 ('모든', 30),
 ('법률로', 30),
 ('정한다', 28),
 ('위하여', 26),
 ('가진다', 25),
 ('아니한다', 25),
 ('및', 25),
 ('이', 24),
 ('국회의', 23),
 ('필요한', 21),
 ('없다', 21),
 ('기타', 21),
 ('대통령은', 19),
 ('하며', 18),
 ('헌법에', 17),
 ('대통령이', 16),
 ('이를', 15),
 ('할', 15),
 ('사항은', 15),
 ('의무를', 14),
 ('국가의', 14),
 ('진다', 13),
 ('국가는', 13),
 ('이상의', 13),
 ('국회는', 13),
 ('임기는', 13),
 ('권리를', 12),
 ('관하여', 12),
 ('법률에', 11),
 ('다만', 11),
 ('대통령의', 11),
 ('대한', 10),
 ('받지', 10),
 ('의한', 10),
 ('있어서', 9),
 ('노력하여야', 9),
 ('자유를', 9),
 ('있을', 9),
 ('국회재적의원', 9),
 ('과반수의', 9),
 ('임명한다', 9),
 ('자유와', 8),
 ('받은', 8),
 ('얻어야', 8),
 ('②국가는', 7),
 ('효력을', 7),
 ('국민의', 7),
 ('보호를', 7),
 ('이에', 7),
 ('아니하고는', 7),
 ('체포', 7),
 ('받을', 7),
 ('아닌', 7),
 ('정한', 7),
 ('한', 7),
 ('한하여', 7),
 ('국회에', 7),
 ('1', 7),
 ('3분의', 7),
 ('2', 7),
 ('범위안에서', 7),
 ('동의를', 7),
 ('둘', 7),
 ('의결을', 6),
 ('대하여', 6),
 ('정부는', 6),
 ('아니하며', 6),
 ('의하지', 6),
 ('청구할', 6),
 ('지체없이', 6),
 ('아니하는', 6),
 ('위한', 6),
 ('국회의원의', 6),
 ('직무를', 6),
 ('찬성이', 6),
 ('이내에', 6),
 ('중요한', 6),
 ('있어야', 6),
 ('제정할', 6),
 ('얻어', 6),
 ('최초의', 6),
 ('보장된다', 5),
 ('따라', 5),
 ('없을', 5),
 ('기타의', 5),
 ('인하여', 5),
 ('중대한', 5),
 ('경우를', 5),
 ('제외하고는', 5),
 ('공무원의', 5),
 ('특별한', 5),
 ('국회에서', 5),
 ('대통령', 5),
 ('③대통령은', 5),
 ('때까지', 5),
 ('규칙을', 5),
 ('법률의', 5),
 ('조직·직무범위', 5),
 ('둔다', 5),
 ('심판', 5),
 ('당시의', 5),
 ('헌법시행', 5),
 ('평화적', 4),
 ('헌법을', 4),
 ('있고', 4),
 ('대한민국의', 4),
 ('②모든', 4),
 ('구속을', 4),
 ('헌법과', 4),
 ('없는', 4),
 ('근로의', 4),
 ('아니할', 4),
 ('구성한다', 4),
 ('국회의원은', 4),
 ('요구할', 4),
 ('재적의원', 4),
 ('못한', 4),
 ('정부에', 4),
 ('승인을', 4),
 ('정부의', 4),
 ('조약', 4),
 ('국무총리', 4),
 ('②제1항의', 4),
 ('저촉되지', 4),
 ('중임할', 4),
 ('발할', 4),
 ('법원의', 4),
 ('제청으로', 4),
 ('국무회의의', 4),
 ('행정각부의', 4),
 ('자문에', 4),
 ('응하기', 4),
 ('법관은', 4),
 ('6년으로', 4),
 ('균형있는', 4),
 ('조국의', 3),
 ('향상을', 3),
 ('국민투표에', 3),
 ('대한민국은', 3),
 ('정책을', 3),
 ('정치적', 3),
 ('받으며', 3),
 ('심판에', 3),
 ('가지는', 3),
 ('인정되지', 3),
 ('불리한', 3),
 ('영장을', 3),
 ('경우와', 3),
 ('당한', 3),
 ('법원에', 3),
 ('이유로', 3),
 ('③모든', 3),
 ('침해받지', 3),
 ('타인의', 3),
 ('권리는', 3),
 ('법률로써', 3),
 ('내용과', 3),
 ('정당한', 3),
 ('재판을', 3),
 ('직무상', 3),
 ('국가', 3),
 ('경우', 3),
 ('대하여는', 3),
 ('받는다', 3),
 ('③국가는', 3),
 ('관하여는', 3),
 ('보장한다', 3),
 ('아니한', 3),
 ('있으며', 3),
 ('속한다', 3),
 ('선거에', 3),
 ('4년으로', 3),
 ('요구가', 3),
 ('출석과', 3),
 ('출석의원', 3),
 ('찬성으로', 3),
 ('것으로', 3),
 ('본다', 3),
 ('법률안은', 3),
 ('제1항의', 3),
 ('법률을', 3),
 ('공포하여야', 3),
 ('후', 3),
 ('헌법이나', 3),
 ('3', 3),
 ('필요가', 3),
 ('절차', 3),
 ('국무위원이', 3),
 ('국무위원', 3),
 ('국무위원의', 3),
 ('자격을', 3),
 ('탄핵의', 3),
 ('국가를', 3),
 ('대통령을', 3),
 ('자를', 3),
 ('아니면', 3),
 ('있는', 3),
 ('수행할', 3),
 ('조직과', 3),
 ('안녕질서를', 3),
 ('명령을', 3),
 ('권한에', 3),
 ('군사에', 3),
 ('공무원과', 3),
 ('수립에', 3),
 ('국민경제의', 3),
 ('발전을', 3),
 ('장은', 3),
 ('대법관이', 3),
 ('대법원장이', 3),
 ('연임할', 3),
 ('탄핵', 3),
 ('금고', 3),
 ('형의', 3),
 ('선고에', 3),
 ('파면되지', 3),
 ('재판의', 3),
 ('경우에는', 3),
 ('재판관은', 3),
 ('정당에', 3),
 ('이용을', 3),
 ('농지의', 3),
 ('헌법개정은', 3),
 ('찬성을', 3),
 ('①이', 3),
 ('②이', 3),
 ('임명된', 3),
 ('하고', 2),
 ('사회적', 2),
 ('조화를', 2),
 ('자유민주적', 2),
 ('하여', 2),
 ('영역에', 2),
 ('기회를', 2),
 ('균등한', 2),
 ('행복을', 2),
 ('것을', 2),
 ('제1조', 2),
 ('국민에게', 2),
 ('제2조', 2),
 ('국민이', 2),
 ('되는', 2),
 ('제3조', 2),
 ('제4조', 2),
 ('통일을', 2),
 ('기본질서에', 2),
 ('제5조', 2),
 ('중립성은', 2),
 ('제6조', 2),
 ('승인된', 2),
 ('같은', 2),
 ('책임을', 2),
 ('신분과', 2),
 ('정당의', 2),
 ('활동이', 2),
 ('헌법재판소에', 2),
 ('제소할', 2),
 ('민족문화의', 2),
 ('창달에', 2),
 ('존엄과', 2),
 ('가지며', 2),
 ('앞에', 2),
 ('누구든지', 2),
 ('생활의', 2),
 ('차별을', 2),
 ('어떠한', 2),
 ('효력이', 2),
 ('적법한', 2),
 ('절차에', 2),
 ('진술을', 2),
 ('수색을', 2),
 ('검사의', 2),
 ('신청에', 2),
 ('법관이', 2),
 ('발부한', 2),
 ('제시하여야', 2),
 ('현행범인인', 2),
 ('죄를', 2),
 ('염려가', 2),
 ('즉시', 2),
 ('변호인의', 2),
 ('조력을', 2),
 ('변호인을', 2),
 ('이유와', 2),
 ('자백이', 2),
 ('부당한', 2),
 ('때', 2),
 ('유죄의', 2),
 ('행위로', 2),
 ('받거나', 2),
 ('불이익한', 2),
 ('처우를', 2),
 ('명예나', 2),
 ('배상을', 2),
 ('보호한다', 2),
 ('그에', 2),
 ('하되', 2),
 ('보상을', 2),
 ('심사할', 2),
 ('죄중', 2),
 ('선포된', 2),
 ('당해', 2),
 ('자가', 2),
 ('불법행위로', 2),
 ('공공단체에', 2),
 ('교육을', 2),
 ('⑤국가는', 2),
 ('평생교육을', 2),
 ('포함한', 2),
 ('운영', 2),
 ('근로자의', 2),
 ('근로는', 2),
 ('근로자는', 2),
 ('단결권·단체교섭권', 2),
 ('단체행동권을', 2),
 ('④국가는', 2),
 ('사유로', 2),
 ('쾌적한', 2),
 ('통하여', 2),
 ('있도록', 2),
 ('경우에', 2),
 ('보통·평등·직접·비밀선거에', 2),
 ('선출된', 2),
 ('정하되', 2),
 ('직을', 2),
 ('겸할', 2),
 ('회기중', 2),
 ('동의없이', 2),
 ('양심에', 2),
 ('행한다', 2),
 ('처분에', 2),
 ('매년', 2),
 ('회기는', 2),
 ('임시회의', 2),
 ('집회를', 2),
 ('선출한다', 2),
 ('헌법', 2),
 ('규정이', 2),
 ('공개한다', 2),
 ('의장이', 2),
 ('필요하다고', 2),
 ('인정할', 2),
 ('공개하지', 2),
 ('의결되지', 2),
 ('임기가', 2),
 ('그러하지', 2),
 ('아니하다', 2),
 ('법률안을', 2),
 ('제출할', 2),
 ('공포한다', 2),
 ('기간내에', 2),
 ('붙여', 2),
 ('재의를', 2),
 ('또한', 2),
 ('같다', 2),
 ('법률로서', 2),
 ('확정된다', 2),
 ('규정에', 2),
 ('확정된', 2),
 ('날로부터', 2),
 ('예산안을', 2),
 ('편성하여', 2),
 ('회계연도', 2),
 ('개시', 2),
 ('의결하여야', 2),
 ('예산안이', 2),
 ('다음의', 2),
 ('목적을', 2),
 ('경비는', 2),
 ('예산에', 2),
 ('정하여', 2),
 ('정부가', 2),
 ('제출한', 2),
 ('부담이', 2),
 ('될', 2),
 ('동의권을', 2),
 ('②국회는', 2),
 ('국정을', 2),
 ('제출', 2),
 ('국무총리·국무위원', 2),
 ('정부위원은', 2),
 ('출석하여', 2),
 ('의견을', 2),
 ('해임을', 2),
 ('대통령에게', 2),
 ('건의할', 2),
 ('내부규율에', 2),
 ('공무원이', 2),
 ('소추를', 2),
 ('탄핵소추는', 2),
 ('의결은', 2),
 ('대통령에', 2),
 ('자는', 2),
 ('형사상의', 2),
 ('②대통령은', 2),
 ('얻은', 2),
 ('후임자를', 2),
 ('선거한다', 2),
 ('60일', 2),
 ('노력하여', 2),
 ('국민', 2),
 ('권한을', 2),
 ('사항에', 2),
 ('재정·경제상의', 2),
 ('안전보장', 2),
 ('공공의', 2),
 ('긴급한', 2),
 ('조치가', 2),
 ('필요하고', 2),
 ('때에', 2),
 ('처분을', 2),
 ('관계되는', 2),
 ('제2항의', 2),
 ('처분', 2),
 ('얻지', 2),
 ('국무총리와', 2),
 ('관계', 2),
 ('국무총리는', 2),
 ('보좌하며', 2),
 ('현역을', 2),
 ('면한', 2),
 ('후가', 2),
 ('임명될', 2),
 ('국무총리의', 2),
 ('심의한다', 2),
 ('국무위원으로', 2),
 ('속하는', 2),
 ('이상', 2),
 ('이하의', 2),
 ('된다', 2),
 ('다음', 2),
 ('국정의', 2),
 ('4', 2),
 ('중요사항', 2),
 ('5', 2),
 ('권한의', 2),
 ('위임', 2),
 ('정당해산의', 2),
 ('직전대통령이', 2),
 ('세입·세출의', 2),
 ('감사원은', 2),
 ('임명하고', 2),
 ('1차에', 2),
 ('자격은', 2),
 ('대법원장의', 2),
 ('③대법원장과', 2),
 ('법관의', 2),
 ('위반되는', 2),
 ('여부가', 2),
 ('전제가', 2),
 ('된', 2),
 ('대법원은', 2),
 ('내부규율과', 2),
 ('사무처리에', 2),
 ('재판관의', 2),
 ('헌법재판소', 2),
 ('헌법소원에', 2),
 ('②헌법재판소는', 2),
 ('3인은', 2),
 ('선출하는', 2),
 ('지명하는', 2),
 ('가입하거나', 2),
 ('정치에', 2),
 ('관여할', 2),
 ('결정', 2),
 ('사무를', 2),
 ('법령의', 2),
 ('선거관리위원회의', 2),
 ('각급', 2),
 ('지시를', 2),
 ('지방자치단체의', 2),
 ('기간', 2),
 ('계획을', 2),
 ('헌법개정', 2),
 ('개시한다', 2),
 ('전일까지로', 2),
 ('후임자가', 2),
 ('설치될', 2),
 ('대한민국헌법', 1),
 ('유구한', 1),
 ('역사와', 1),
 ('전통에', 1),
 ('빛나는', 1),
 ('우리', 1),
 ('대한국민은', 1),
 ('3·1운동으로', 1),
 ('건립된', 1),
 ('대한민국임시정부의', 1),
 ('법통과', 1),
 ('불의에', 1),
 ('항거한', 1),
 ('4·19민주이념을', 1),
 ('계승하고', 1),
 ('민주개혁과', 1),
 ('통일의', 1),
 ('사명에', 1),
 ('입각하여', 1),
 ('정의·인도와', 1),
 ('동포애로써', 1),
 ('민족의', 1),
 ('단결을', 1),
 ('공고히', 1),
 ('폐습과', 1),
 ('불의를', 1),
 ('타파하며', 1),
 ('자율과', 1),
 ('바탕으로', 1),
 ('기본질서를', 1),
 ('더욱', 1),
 ('확고히', 1),
 ('정치·경제·사회·문화의', 1),
 ('각인의', 1),
 ('균등히', 1),
 ('능력을', 1),
 ('최고도로', 1),
 ('발휘하게', 1),
 ('권리에', 1),
 ('따르는', 1),
 ('책임과', 1),
 ('완수하게', 1),
 ('안으로는', 1),
 ('국민생활의', 1),
 ('기하고', 1),
 ('밖으로는', 1),
 ('항구적인', 1),
 ('세계평화와', 1),
 ('인류공영에', 1),
 ('이바지함으로써', 1),
 ('우리들과', 1),
 ('우리들의', 1),
 ('자손의', 1),
 ('안전과', 1),
 ('영원히', 1),
 ('확보할', 1),
 ('다짐하면서', 1),
 ('1948년', 1),
 ('7월', 1),
 ('12일에', 1),
 ('제정되고', 1),
 ('8차에', 1),
 ('걸쳐', 1),
 ('개정된', 1),
 ('이제', 1),
 ('거쳐', 1),
 ('개정한다', 1),
 ('제1장', 1),
 ('총강', 1),
 ('민주공화국이다', 1),
 ('②대한민국의', 1),
 ('주권은', 1),
 ('권력은', 1),
 ('국민으로부터', 1),
 ('나온다', 1),
 ('요건은', 1),
 ('재외국민을', 1),
 ('보호할', 1),
 ('영토는', 1),
 ('한반도와', 1),
 ('부속도서로', 1),
 ('지향하며', 1),
 ('입각한', 1),
 ('통일', 1),
 ('수립하고', 1),
 ('추진한다', 1),
 ('국제평화의', 1),
 ('유지에', 1),
 ('노력하고', 1),
 ('침략적', 1),
 ('전쟁을', 1),
 ('부인한다', 1),
 ('②국군은', 1),
 ('안전보장과', 1),
 ('국토방위의', 1),
 ('신성한', 1),
 ('수행함을', 1),
 ('사명으로', 1),
 ('준수된다', 1),
 ('체결·공포된', 1),
 ('조약과', 1),
 ('일반적으로', 1),
 ('국제법규는', 1),
 ('국내법과', 1),
 ('②외국인은', 1),
 ('국제법과', 1),
 ('조약이', 1),
 ('지위가', 1),
 ('제7조', 1),
 ('공무원은', 1),
 ('국민전체에', 1),
 ('봉사자이며', 1),
 ('국민에', 1),
 ('②공무원의', 1),
 ('제8조', 1),
 ('설립은', 1),
 ('자유이며', 1),
 ('복수정당제는', 1),
 ('②정당은', 1),
 ('목적·조직과', 1),
 ('민주적이어야', 1),
 ('의사형성에', 1),
 ('참여하는데', 1),
 ('조직을', 1),
 ('가져야', 1),
 ('③정당은', 1),
 ('정당운영에', 1),
 ('자금을', 1),
 ('보조할', 1),
 ('④정당의', 1),
 ('목적이나', 1),
 ('민주적', 1),
 ('위배될', 1),
 ('해산을', 1),
 ('정당은', 1),
 ('헌법재판소의', 1),
 ('해산된다', 1),
 ('제9조', 1),
 ('전통문화의', 1),
 ('계승·발전과', 1),
 ('제2장', 1),
 ('권리와', 1),
 ('의무', 1),
 ('제10조', 1),
 ('인간으로서의', 1),
 ('가치를', 1),
 ('추구할', 1),
 ('개인이', 1),
 ('불가침의', 1),
 ('기본적', 1),
 ('인권을', 1),
 ('확인하고', 1),
 ('보장할', 1),
 ('제11조', 1),
 ('법', 1),
 ('평등하다', 1),
 ('성별·종교', 1),
 ('신분에', 1),
 ('정치적·경제적·사회적·문화적', 1),
 ('②사회적', 1),
 ('특수계급의', 1),
 ('제도는', 1),
 ('형태로도', 1),
 ('창설할', 1),
 ('③훈장등의', 1),
 ('영전은', 1),
 ('자에게만', 1),
 ('특권도', 1),
 ('따르지', 1),
 ('제12조', 1),
 ('신체의', 1),
 ('체포·구속·압수·수색', 1),
 ('심문을', 1),
 ('법률과', 1),
 ('처벌·보안처분', 1),
 ('강제노역을', 1),
 ('고문을', 1),
 ('형사상', 1),
 ('자기에게', 1),
 ('강요당하지', 1),
 ('③체포·구속·압수', 1),
 ('장기', 1),
 ('3년', 1),
 ('형에', 1),
 ('해당하는', 1),
 ('범하고', 1),
 ('도피', 1),
 ('증거인멸의', 1),
 ('사후에', 1),
 ('④누구든지', 1),
 ('형사피고인이', 1),
 ('스스로', 1),
 ('구할', 1),
 ('국가가', 1),
 ('붙인다', 1),
 ('⑤누구든지', 1),
 ('구속의', 1),
 ('권리가', 1),
 ('있음을', 1),
 ('고지받지', 1),
 ('당하지', 1),
 ('자의', 1),
 ('가족등', 1),
 ('자에게는', 1),
 ('일시·장소가', 1),
 ('통지되어야', 1),
 ('⑥누구든지', 1),
 ('적부의', 1),
 ('심사를', 1),
 ('⑦피고인의', 1),
 ('고문·폭행·협박·구속의', 1),
 ('장기화', 1),
 ('기망', 1),
 ('방법에', 1),
 ('자의로', 1),
 ('진술된', 1),
 ('것이', 1),
 ('아니라고', 1),
 ('인정될', 1),
 ('정식재판에', 1),
 ('피고인의', 1),
 ('그에게', 1),
 ('유일한', 1),
 ('증거일', 1),
 ('증거로', 1),
 ('삼거나', 1),
 ('처벌할', 1),
 ('제13조', 1),
 ('행위시의', 1),
 ('범죄를', 1),
 ('구성하지', 1),
 ('소추되지', 1),
 ('동일한', 1),
 ('범죄에', 1),
 ('거듭', 1),
 ('처벌받지', 1),
 ('소급입법에', 1),
 ('참정권의', 1),
 ('제한을', 1),
 ('재산권을', 1),
 ('박탈당하지', 1),
 ('자기의', 1),
 ('행위가', 1),
 ('친족의', 1),
 ('제14조', 1),
 ('거주·이전의', 1),
 ('제15조', 1),
 ('직업선택의', 1),
 ('제16조', 1),
 ('주거의', 1),
 ('주거에', 1),
 ('압수나', 1),
 ('제17조', 1),
 ('사생활의', 1),
 ('비밀과', 1),
 ('제18조', 1),
 ('통신의', 1),
 ('비밀을', 1),
 ('제19조', 1),
 ('양심의', 1),
 ('제20조', 1),
 ('종교의', 1),
 ('②국교는', 1),
 ('종교와', 1),
 ('정치는', 1),
 ('분리된다', 1),
 ('제21조', 1),
 ('언론·출판의', 1),
 ('집회·결사의', 1),
 ('②언론·출판에', 1),
 ('허가나', 1),
 ('검열과', 1),
 ('집회·결사에', 1),
 ('허가는', 1),
 ('③통신·방송의', 1),
 ('시설기준과', 1),
 ('신문의', 1),
 ('기능을', 1),
 ('보장하기', 1),
 ('④언론·출판은', 1),
 ('권리', 1),
 ('공중도덕이나', 1),
 ('사회윤리를', 1),
 ('침해하여서는', 1),
 ('아니된다', 1),
 ('언론·출판이', 1),
 ('침해한', 1),
 ('피해자는', 1),
 ('피해의', 1),
 ('제22조', 1),
 ('학문과', 1),
 ('예술의', 1),
 ('②저작자·발명가·과학기술자와', 1),
 ('예술가의', 1),
 ('제23조', 1),
 ('재산권은', 1),
 ('한계는', 1),
 ('②재산권의', 1),
 ('행사는', 1),
 ('공공복리에', 1),
 ('적합하도록', 1),
 ('하여야', 1),
 ('③공공필요에', 1),
 ('재산권의', 1),
 ('수용·사용', 1),
 ('제한', 1),
 ('보상은', 1),
 ('지급하여야', 1),
 ('제24조', 1),
 ('선거권을', 1),
 ('제25조', 1),
 ('공무담임권을', 1),
 ('제26조', 1),
 ('국가기관에', 1),
 ('문서로', 1),
 ('청원할', 1),
 ('청원에', 1),
 ('제27조', 1),
 ('법관에', 1),
 ('②군인', 1),
 ('군무원이', 1),
 ('영역안에서는', 1),
 ('군사상', 1),
 ('기밀·초병·초소·유독음식물공급·포로·군용물에', 1),
 ('비상계엄이', 1),
 ('군사법원의', 1),
 ('신속한', 1),
 ('형사피고인은', 1),
 ('상당한', 1),
 ('이유가', 1),
 ('공개재판을', 1),
 ('④형사피고인은', 1),
 ('판결이', 1),
 ('확정될', 1),
 ('때까지는', 1),
 ('무죄로', 1),
 ('추정된다', 1),
 ('⑤형사피해자는', 1),
 ('사건의', 1),
 ('재판절차에서', 1),
 ('진술할', 1),
 ('제28조', 1),
 ('형사피의자', 1),
 ('형사피고인으로서', 1),
 ('구금되었던', 1),
 ('불기소처분을', 1),
 ('무죄판결을', 1),
 ('국가에', 1),
 ('제29조', 1),
 ('손해를', 1),
 ('공무원', 1),
 ('자신의', 1),
 ('책임은', 1),
 ('면제되지', 1),
 ('②군인·군무원·경찰공무원', 1),
 ('전투·훈련등', 1),
 ('직무집행과', 1),
 ('관련하여', 1),
 ('손해에', 1),
 ('보상외에', 1),
 ('인한', 1),
 ('배상은', 1),
 ('제30조', 1),
 ('범죄행위로', 1),
 ('생명·신체에', 1),
 ('피해를', 1),
 ('국가로부터', 1),
 ('구조를', 1),
 ('제31조', 1),
 ('능력에', 1),
 ('균등하게', 1),
 ('보호하는', 1),
 ('자녀에게', 1),
 ('적어도', 1),
 ('초등교육과', 1),
 ('받게', 1),
 ('③의무교육은', 1),
 ('무상으로', 1),
 ('④교육의', 1),
 ('자주성·전문성·정치적', 1),
 ('중립성', 1),
 ('대학의', 1),
 ('자율성은', 1),
 ('진흥하여야', 1),
 ('⑥학교교육', 1),
 ('교육제도와', 1),
 ('교육재정', 1),
 ('교원의', 1),
 ('지위에', 1),
 ('기본적인', 1),
 ('제32조', 1),
 ('사회적·경제적', 1),
 ('방법으로', 1),
 ('고용의', 1),
 ('증진과', 1),
 ('적정임금의', 1),
 ('보장에', 1),
 ('최저임금제를', 1),
 ('시행하여야', 1),
 ('의무의', 1),
 ('조건을', 1),
 ('민주주의원칙에', 1),
 ('③근로조건의', 1),
 ('기준은', 1),
 ('인간의', 1),
 ('존엄성을', 1),
 ('보장하도록', 1),
 ('④여자의', 1),
 ('고용·임금', 1),
 ('근로조건에', 1),
 ('⑤연소자의', 1),
 ('⑥국가유공자·상이군경', 1),
 ('전몰군경의', 1),
 ('유가족은', 1),
 ('우선적으로', 1),
 ('부여받는다', 1),
 ('제33조', 1),
 ('근로조건의', 1),
 ('자주적인', 1),
 ('②공무원인', 1),
 ('자에', 1),
 ('③법률이', 1),
 ('주요방위산업체에', 1),
 ('종사하는', 1),
 ('단체행동권은', 1),
 ('제한하거나', 1),
 ('인정하지', 1),
 ('제34조', 1),
 ('인간다운', 1),
 ('생활을', 1),
 ('사회보장·사회복지의', 1),
 ('증진에', 1),
 ('노력할', 1),
 ('여자의', 1),
 ('복지와', 1),
 ('권익의', 1),
 ('노인과', 1),
 ('청소년의', 1),
 ('복지향상을', 1),
 ('실시할', 1),
 ('⑤신체장애자', 1),
 ('질병·노령', 1),
 ('생활능력이', 1),
 ('⑥국가는', 1),
 ('재해를', 1),
 ('예방하고', 1),
 ('위험으로부터', 1),
 ('국민을', 1),
 ('보호하기', 1),
 ('제35조', 1),
 ('건강하고', 1),
 ('환경에서', 1),
 ('생활할', 1),
 ('국가와', 1),
 ('환경보전을', 1),
 ('②환경권의', 1),
 ('행사에', 1),
 ('주택개발정책등을', 1),
 ('주거생활을', 1),
 ('제36조', 1),
 ('혼인과', 1),
 ('가족생활은', 1),
 ('개인의', 1),
 ('양성의', 1),
 ('평등을', 1),
 ('기초로', 1),
 ('성립되고', 1),
 ('유지되어야', 1),
 ('모성의', 1),
 ('보건에', 1),
 ('제37조', 1),
 ('열거되지', 1),
 ('경시되지', 1),
 ('②국민의', 1),
 ('국가안전보장·질서유지', 1),
 ('공공복리를', 1),
 ('제한할', 1),
 ('제한하는', 1),
 ('경우에도', 1),
 ('권리의', 1),
 ('본질적인', 1),
 ('내용을', 1),
 ('침해할', 1),
 ('제38조', 1),
 ('납세의', 1),
 ('제39조', 1),
 ('국방의', 1),
 ('②누구든지', 1),
 ('병역의무의', 1),
 ('이행으로', 1),
 ('제3장', 1),
 ('국회', 1),
 ('제40조', 1),
 ('입법권은', 1),
 ('제41조', 1),
 ('국회의원으로', 1),
 ('②국회의원의', 1),
 ('수는', 1),
 ('200인', 1),
 ('이상으로', 1),
 ('③국회의원의', 1),
 ('선거구와', 1),
 ('비례대표제', 1),
 ('제42조', 1),
 ('제43조', 1),
 ('제44조', 1),
 ('구금되지', 1),
 ...]
In [32]:
# Korean font setup, option 1: set the font family by name through rcParams.
# NOTE(review): presumably requires the NanumBarunGothic font to be installed — confirm.

# import matplotlib as plt
# plt.rcParams['font.family'] = 'NanumBarunGothic'
# plt.rcParams['axes.unicode_minus'] = False

# Korean font setup, option 2: resolve the family name from a font file and
# make it the default font.
from matplotlib import font_manager, rc

# NOTE(review): this path looks macOS-specific — adjust on other systems.
path='/Library/Fonts/AppleGothic.ttf'

family = font_manager.FontProperties(fname=path).get_name()
rc('font' ,family=family)

# Plot the 50 most frequent tokens of the Korean text.
kt.plot(50)
In [44]:
# Back to the English novel (t) and the Korean constitution (kt).
# Text.count(word) is used here to get the number of occurrences of a token.
# NOTE: in a notebook only the value of the last expression is displayed,
# so the first tuple is computed and then discarded.

t.count('Emma'), t.count('the'), t.count('The')
kt.count("국민"), kt.count("국회")
Out[44]:
(2, 1)

검색엔진

질의(query)

검색결과

제목-링크

본문(질의위치)

In [47]:
# parameter는 word 한 개
# 전 후 몇 라인들이 return 됨. 
t.concordance('Emma')
kt.concordance('국민')
Displaying 25 of 855 matches:
[ Emma by Jane Austen 1816 ] VOLUME I CHAPT
ane Austen 1816 ] VOLUME I CHAPTER I Emma Woodhouse , handsome , clever , and 
both daughters , but particularly of Emma . Between _them_ it was more the int
 friend very mutually attached , and Emma doing just what she liked ; highly e
r own . The real evils , indeed , of Emma 's situation were the power of havin
ding-day of this beloved friend that Emma first sat in mournful thought of any
ing only half a mile from them ; but Emma was aware that great must be the dif
y . It was a melancholy change ; and Emma could not but sigh over it , and wis
 the rest of her life at Hartfield . Emma smiled and chatted as cheerfully as 
able to tell her how we all are . '' Emma spared no exertions to maintain this
 ' I have a great regard for you and Emma ; but when it comes to the question 
ful , troublesome creature ! '' said Emma playfully . `` That is what you have
e few people who could see faults in Emma Woodhouse , and the only one who eve
is was not particularly agreeable to Emma herself , she knew it would be so mu
g thought perfect by every body . `` Emma knows I never flatter her , '' said 
t be a gainer . '' `` Well , '' said Emma , willing to let it pass -- '' you w
re of meeting every day . '' `` Dear Emma bears every thing so well , '' said 
ss her more than she thinks for . '' Emma turned away her head , divided betwe
nd smiles . `` It is impossible that Emma should not miss such a companion , '
en one matter of joy to me , '' said Emma , '' and a very considerable one -- 
od to them , by interference . '' `` Emma never thinks of herself , if she can
etter thing . Invite him to dinner , Emma , and help him to the best of the fi
 could not think , without pain , of Emma 's losing a single pleasure , or suf
 of her companionableness : but dear Emma was of no feeble character ; she was
, was so just and so apparent , that Emma , well as she knew her father , was 
Displaying 2 of 2 matches:
민족문화의 창달에 노력하여 대통령으로서의 직책을 성실히 수행할 것을 국민 앞에 엄숙히 선서합니다 . '' 제70조 대통령의 임기는 5년으로 
위탁경영은 법률이 정하는 바에 의하여 인정된다 . 제122조 국가는 국민 모두의 생산 및 생활의 기반이 되는 국토의 효율적이고 균형있는 이용
In [48]:
# 같은 위치에 나온 다른 단어를 찾음
# 주어에 해당하기에 다른 인물, 성별을 추측해볼 수 있다. 
t.similar("Emma")
she it he i weston you her harriet elton him me knightley jane that
and the all there they them
In [50]:
# 단어들이 어떻게 분포해있는지를 한 눈에 보여줌 
# 문맥상 역할을 알 수도 있음 
t.dispersion_plot(["Emma","Jane"])

모아둔 뉴스기사 불러오기

In [124]:
import os

path = "./News"
fileList = [file for file in os.listdir(path) if file.startswith("경제") and file.endswith('.txt')]
In [125]:
fileList
Out[125]:
['경제_0004188602.txt',
 '경제_0009113386.txt',
 '경제_0002220441.txt',
 '경제_0004330537.txt',
 '경제_0009113330.txt',
 '경제_0010694192.txt',
 '경제_0010694746.txt',
 '경제_0003882516.txt',
 '경제_0002891478.txt',
 '경제_0002891452.txt']
In [128]:
# Load every collected article and concatenate them into one corpus string.
# Each article is prefixed with a newline so file boundaries stay on separate lines.
# (To load a single article, drop the loop and read just one entry of fileList.)
# Start from an empty string so the cell works on a fresh kernel and does not
# append the same articles again when it is re-run.
news = ""
for file in fileList:
    # Explicit encoding instead of the platform default.
    # NOTE(review): assumes the article files were saved as UTF-8 — confirm.
    with open(path + "/" + file, encoding="utf-8") as fp:
        news += "\n" + fp.read()
In [168]:
len(news.splitlines())
type(news)
Out[168]:
str
In [134]:
newsT = Text(word_tokenize(news))
len(newsT), len(set(newsT)), newsT.vocab().most_common()
newsT.plot(20)

n-gram

n의 자리에 숫자가 들어간다.

1 = unigram ( 기존과 같음 )

In [ ]:
Beautiful is better than ugly
   (0)       (1)  (2)     (3)    (4) 
    n-gram => chunk로 생각하기 
    (0) (1) => new1
    => 어절 단위의 ngram
    
    국민의, 국민을
    국민, 민의, 국민, 민을  (바이그램, 음절단위)
    
P(ugly|Beautiful, is, better, than)
P(than | Beautiful,is, better)
P(better | Beautiful,is )
P(is | Beautiful)
P(Beautiful)

-> 확률을 count /freq로 구할 것이다. 

P(Beautiful) = freq(Beautiful)/freq(tokens)
문맥이 길어질수록 분자·분모의 빈도 중 하나가 0이 되어 버리는 경우가 너무 많음
=> 1차 마르코프 가정(1st-order Markov assumption) : 바로 앞의 단어 하나만 조건으로 사용
    
P(ugly| than)
P(than | better)
P(better | is )
P(is | Beautiful)
P(Beautiful)

어절 단위의 n-gram ( n = 2 )

In [149]:
def ngramEojeol(sentence, n=2):
    """Build word-level (eojeol) n-grams from a whitespace-delimited sentence.

    The sentence is split on whitespace and every window of ``n`` consecutive
    words is joined back together with a single space.  A sentence of four
    words therefore yields 3 bigrams (n=2) or 2 trigrams (n=3); in general
    ``len(words) - n + 1`` n-grams.

    Args:
        sentence: Text to split into words.
        n: Window size in words (2 = bigram, 3 = trigram).

    Returns:
        List of space-joined n-gram strings, in order of appearance.
    """
    words = sentence.split()
    window_count = len(words) - n + 1
    return [" ".join(words[start:start + n]) for start in range(window_count)]

ngramEojeol("동해물과 백두산이 마르고 닳도록")
# ngramEojeol("동해물과 백두산이 마르고 닳도록", n=3) # trigram
Out[149]:
['동해물과 백두산이', '백두산이 마르고', '마르고 닳도록']

음절 단위의 n-gram

In [113]:
def ngramUmjeol(term, n=2):
    """Build character-level (syllable) n-grams from ``term``.

    A window of ``n`` consecutive elements slides over ``term`` one position
    at a time and each window is concatenated into a single string, giving
    ``len(term) - n + 1`` n-grams.  Whitespace is not treated specially, so
    spaces and newlines in ``term`` become part of the n-grams.

    Args:
        term: String (or sequence of strings) to slice.
        n: Window size in characters (2 = bigram, 3 = trigram).

    Returns:
        List of n-gram strings, in order of appearance.
    """
    last_start = len(term) - n + 1
    return ["".join(term[pos:pos + n]) for pos in range(last_start)]

ngramUmjeol("국민의국민을")
Out[113]:
['국민', '민의', '의국', '국민', '민을']
In [135]:
bigram = Text(ngramEojeol(news))
bigram
Out[135]:
<Text: ('//', 'flash') ('flash', '오류를') ('오류를', '우회하기') ('우회하기', '위한') ('위한', '함수') ('함수', '추가') ('추가', 'function') ('function', '_flash_removeCallback()')...>
In [136]:
bigram, len(bigram), len(set(bigram))  # 쌍
Out[136]:
(<Text: ('//', 'flash') ('flash', '오류를') ('오류를', '우회하기') ('우회하기', '위한') ('위한', '함수') ('함수', '추가') ('추가', 'function') ('function', '_flash_removeCallback()')...>,
 4056,
 3418)
In [138]:
bigram.vocab().most_common(20)
# 거르지 않았던 문자가 나옴 
Out[138]:
[(('//', 'flash'), 11),
 (('flash', '오류를'), 11),
 (('오류를', '우회하기'), 11),
 (('우회하기', '위한'), 11),
 (('위한', '함수'), 11),
 (('함수', '추가'), 11),
 (('추가', 'function'), 11),
 (('function', '_flash_removeCallback()'), 11),
 (('_flash_removeCallback()', '{}'), 11),
 (('및', '재배포'), 6),
 (('경남', '거제시'), 6),
 (('보통주', '1주당'), 6),
 (('의결권', '행사'), 5),
 (('당시', '사외이사로서'), 5),
 (('받고', '있다.'), 5),
 (('무급휴직자', '400여'), 4),
 (('400여', '명을'), 4),
 (('올해', '안에'), 4),
 (('무급휴직에', '들어간'), 4),
 (('무단전재', '및'), 4)]
In [139]:
bigram = Text(ngramUmjeol(news))
bigram
Out[139]:
<Text: 

 

 

 

 

 
/ // / ...>
In [140]:
bigram, len(bigram), len(set(bigram))  # 쌍
Out[140]:
(<Text: 
 
  
 
  
 
  
 
  
 
  
 / // / ...>, 20086, 4535)
In [141]:
bigram.vocab().most_common(20)
# 불용어 처리가 필요해 보임 ( 이전 단계에서 )
Out[141]:
[('  ', 437),
 ('다.', 244),
 ('는 ', 230),
 ('. ', 215),
 ('을 ', 174),
 ('에 ', 164),
 ('의 ', 163),
 ('로 ', 127),
 ('이 ', 125),
 ('를 ', 122),
 ('했다', 113),
 ('은 ', 113),
 (', ', 112),
 (' 이', 97),
 ('한 ', 95),
 ('고 ', 94),
 (' 현', 92),
 (' 1', 89),
 ('서 ', 84),
 ('\n\n', 80)]
In [145]:
# 다시 영문 소설 가지고
bigram = Text(ngramEojeol(corpus,n=3))
bigram
Out[145]:
<Text: ('[Emma', 'by', 'Jane') ('by', 'Jane', 'Austen') ('Jane', 'Austen', '1816]') ('Austen', '1816]', 'VOLUME') ('1816]', 'VOLUME', 'I') ('VOLUME', 'I', 'CHAPTER') ('I', 'CHAPTER', 'I') ('CHAPTER', 'I', 'Emma')...>
In [146]:
bigram, len(bigram), len(set(bigram)),bigram.vocab().most_common(10)  # 쌍
Out[146]:
(<Text: ('[Emma', 'by', 'Jane') ('by', 'Jane', 'Austen') ('Jane', 'Austen', '1816]') ('Austen', '1816]', 'VOLUME') ('1816]', 'VOLUME', 'I') ('VOLUME', 'I', 'CHAPTER') ('I', 'CHAPTER', 'I') ('CHAPTER', 'I', 'Emma')...>,
 158165,
 137443,
 [(('I', 'do', 'not'), 94),
  (('I', 'am', 'sure'), 75),
  (('would', 'have', 'been'), 55),
  (('a', 'great', 'deal'), 55),
  (('she', 'could', 'not'), 49),
  (('could', 'not', 'be'), 45),
  (('she', 'had', 'been'), 44),
  (('it', 'would', 'be'), 43),
  (('do', 'not', 'know'), 43),
  (('Mr.', 'and', 'Mrs.'), 37)])
In [148]:
bigram = Text(ngramUmjeol(corpus,n=3))
bigram, len(bigram), len(set(bigram)),bigram.vocab().most_common(10)  # 쌍
Out[148]:
(<Text: [Em Emm mma ma  a b  by by  y J...>,
 887069,
 9395,
 [(' th', 10154),
  ('he ', 8460),
  ('the', 8005),
  ('nd ', 5560),
  (' to', 5266),
  ('ing', 5258),
  ('and', 5257),
  ('to ', 5177),
  ('her', 4911),
  (' an', 4835)])

욕 필터링하기

각 음절 사이에 공백(' ')을 삽입하고, 원래 있던 공백은 "_"로 치환

각 단어의 끝에 </w>를 붙임

데이터 : 각 단어의 빈도

  1. 음절을 쪼개기

  2. 패턴을 만들기 (n=2)

  3. 그중에 빈도가 가장 높은 케이스를 찾아야함 (베스트 케이스) 빈도 높은 1개

  4. 병합

  5. 2~4번 과정을 N번 반복

In [240]:
def splitTerms(term):
    """Split text into space-separated symbols in the BPE input format.

    Each whitespace-delimited word is broken into its characters, the
    end-of-word marker ``</w>`` is appended, and the symbols are joined with
    single spaces.  Words are then joined with ``" _ "`` so the original word
    boundaries are represented by a standalone ``_`` symbol.

    Args:
        term: Text made of one or more whitespace-separated words.

    Returns:
        The symbol string, e.g. ``"low"`` -> ``"l o w </w>"``.  An empty or
        whitespace-only input yields ``""``.
    """
    return " _ ".join(
        " ".join(list(word) + ["</w>"]) for word in term.split()
    )

data

  1. (l, o) (o, w) (w, </w>)
In [243]:
from collections import defaultdict
In [244]:
def findNgram(tokens, n=2):
    """Count frequency-weighted symbol n-grams over a vocabulary.

    Args:
        tokens: Mapping from a space-separated symbol string
            (e.g. ``"l o w </w>"``) to that word's frequency.
        n: Number of adjacent symbols per n-gram (default 2 = pairs).

    Returns:
        ``defaultdict(int)`` mapping each n-tuple of adjacent symbols to the
        sum of the frequencies of the words it occurs in (counted once per
        occurrence).

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    result = defaultdict(int)

    for k, v in tokens.items():
        symbols = k.split()

        for i in range(len(symbols) - n + 1):
            # Take the full window of n symbols; a hard-coded pair here would
            # silently ignore the n argument.
            result[tuple(symbols[i:i + n])] += v

    return result
            
In [245]:
# 키 벨류 쌍으로 나옴 
findNgram(data)
Out[245]:
defaultdict(int,
            {('l', 'o'): 7,
             ('o', 'w'): 7,
             ('w', '</w>'): 5,
             ('w', 'e'): 8,
             ('e', 's'): 2,
             ('s', 't'): 2,
             ('t', '</w>'): 2,
             ('n', 'e'): 6,
             ('e', 'w'): 6,
             ('e', 'r'): 9,
             ('r', '</w>'): 9,
             ('w', 'i'): 3,
             ('i', 'd'): 3,
             ('d', 'e'): 3})
In [246]:
pattern = findNgram(data)

# Linear scan for the most frequent pair. The comparison is strict, so on a
# tie the pair that was seen first is kept.
maxkey= None
maxval = 0
for k,v in pattern.items():
    if v > maxval:
        maxval = v
        maxkey = k
print(maxkey,maxval)
('e', 'r') 9
In [247]:
# A more concise version of the manual scan: let max() pick the key whose
# count is largest. The commented-out lines are equivalent spellings.
maxkey= None
maxval = 0
# maxkey = max(pattern, key = lambda x:pattern[x])

# maxkey = max(pattern, key = lambda x:pattern.get(x))

maxkey = max(pattern, key = pattern.get)

print(maxkey)


# print(max(pattern, key = lambda x:pattern[x]))
('e', 'r')
In [248]:
import re

def mergeNgram(maxkey, data):
    """Merge every occurrence of the symbol sequence ``maxkey`` in the vocabulary.

    Args:
        maxkey: Tuple of adjacent symbols to fuse, e.g. ``('e', 'r')``.
        data: Mapping from a space-separated symbol string to its frequency.

    Returns:
        ``defaultdict(int)`` with the same words, where each standalone
        occurrence of the symbols in ``maxkey`` has been replaced by their
        concatenation (``'n e w e r </w>'`` -> ``'n e w er </w>'``).
    """
    merged_symbol = "".join(maxkey)

    # Match the sequence only when it is delimited by whitespace (or the string
    # edges) on both sides: (?<!\S) = not preceded by a non-space character,
    # (?!\S) = not followed by one.  Without the lookbehind a merge could start
    # in the middle of a longer symbol.  re.escape keeps symbols such as '.' or
    # '(' from being interpreted as regex syntax.
    pattern = re.compile(r"(?<!\S)" + re.escape(" ".join(maxkey)) + r"(?!\S)")

    result = defaultdict(int)
    for k, v in data.items():
        # A callable replacement inserts the merged symbol literally, so any
        # backslash in it is not treated as a group reference.
        new = pattern.sub(lambda _match: merged_symbol, k)
        # Accumulate rather than overwrite in case two entries become identical.
        result[new] += v

    return result

https://regexr.com 에서 정규식 검사

In [250]:
# Toy vocabulary: every word is pre-split into symbols and mapped to its frequency.
data = {
    splitTerms("low"): 5,
    splitTerms("lowest"): 2,
    splitTerms("newer"): 6,
    splitTerms("wider"): 3,
}

# Five merge rounds: count adjacent symbol pairs, pick the most frequent one,
# fuse it everywhere in the vocabulary, then print the chosen pair and the
# updated vocabulary.
for i in range(5):
    pattern = findNgram(data)
    maxkey = max(pattern, key=pattern.get)
    data = mergeNgram(maxkey, data)
    print(maxkey)
    print(data)

# er끼리 붙음
('e', 'r')
defaultdict(<class 'int'>, {'l o w </w>': 5, 'l o w e s t </w>': 2, 'n e w er </w>': 6, 'w i d er </w>': 3})
('er', '</w>')
defaultdict(<class 'int'>, {'l o w </w>': 5, 'l o w e s t </w>': 2, 'n e w er</w>': 6, 'w i d er</w>': 3})
('l', 'o')
defaultdict(<class 'int'>, {'lo w </w>': 5, 'lo w e s t </w>': 2, 'n e w er</w>': 6, 'w i d er</w>': 3})
('lo', 'w')
defaultdict(<class 'int'>, {'low </w>': 5, 'low e s t </w>': 2, 'n e w er</w>': 6, 'w i d er</w>': 3})
('n', 'e')
defaultdict(<class 'int'>, {'low </w>': 5, 'low e s t </w>': 2, 'ne w er</w>': 6, 'w i d er</w>': 3})
In [ ]:
 
In [ ]:
 


728x90
반응형