👨🏻‍🏫IT 활동/인공지능교육 - NLP

[NLP] Day 13 - POS Taggers

728x90
반응형

POS ( Part Of Speech )

In [9]:
import re
from string import punctuation
from nltk.tokenize import word_tokenize

sentence = "She sells seashells on the seashore."

# Character class matching any single ASCII punctuation mark.
pattern = re.compile(r"[{0}]".format(re.escape(punctuation)))

# Strip punctuation from the raw sentence, then split it into word tokens.
sentence = pattern.sub("", sentence)
tokens = word_tokenize(sentence)

# Collect lower-cased tokens; a token that still contains punctuation is
# printed instead of being collected.
result = []
for token in tokens:
    if not pattern.search(token):
        result.append(token.lower())
    else:
        print(token)

result
Out[9]:
['she', 'sells', 'seashells', 'on', 'the', 'seashore']
In [2]:
# NOTE(review): the Out[2] listing below still contains '.', so it presumably
# comes from an earlier run that tokenized the raw sentence — confirm.
tokens
Out[2]:
['She', 'sells', 'seashells', 'on', 'the', 'seashore', '.']
In [25]:
from nltk import pos_tag
# One-time setup if the tagger model is missing:
#   import nltk
#   nltk.download('averaged_perceptron_tagger')

# Tokens can also be filtered by part of speech. The lower-cased tokens are
# tagged once (the earlier, discarded pos_tag(result) call was redundant) and
# every word tagged IN or DT is dropped.
tags = []
for word, tag in pos_tag(result):
    if tag in ("IN", "DT"):
        print("Skipped", word)
    else:
        print(word)
        tags.append(word)

tags
she
sells
seashells
Skipped on
Skipped the
seashore
Out[25]:
['she', 'sells', 'seashells', 'seashore']
In [20]:
# One-time setup if the tagset help data is missing:
# import nltk
# nltk.download('tagsets')
from nltk.help import brown_tagset, upenn_tagset

# Each tagset defines its own set of part-of-speech labels.

upenn_tagset("N.*")  # tagset used by pos_tag above; the regex "N.*" selects every tag starting with N
# brown_tagset()
NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
NNPS: noun, proper, plural
    Americans Americas Amharas Amityvilles Amusements Anarcho-Syndicalists
    Andalusians Andes Andruses Angels Animals Anthony Antilles Antiques
    Apache Apaches Apocrypha ...
NNS: noun, common, plural
    undergraduates scotches bric-a-brac products bodyguards facets coasts
    divestitures storehouses designs clubs fragrances averages
    subjectivists apprehensions muses factory-jobs ...
In [21]:
# Print the Penn Treebank help entry for the VBZ tag.
upenn_tagset("VBZ")
VBZ: verb, present tense, 3rd person singular
    bases reconstructs marks mixes displeases seals carps weaves snatches
    slumps stretches authorizes smolders pictures emerges stockpiles
    seduces fizzes uses bolsters slaps speaks pleads ...
In [27]:
# Same pipeline applied to a different sentence.
sentence = "The little yellow dog barked at the Persian Cat."

pattern = re.compile(r"[{0}]".format(re.escape(punctuation)))

# Remove punctuation first, then tokenize.
sentence = pattern.sub("", sentence)
tokens = word_tokenize(sentence)

# Lower-case every token; a token still containing punctuation is printed
# and left out.
result = []
for token in tokens:
    if pattern.search(token):
        print(token)
        continue
    result.append(token.lower())

# Tag the cleaned tokens and keep every word whose tag is neither IN nor DT.
tags = []
for term in pos_tag(result):
    if term[1] not in ["IN", "DT"]:
        print(term[0])
        tags.append(term[0])
    else:
        print("Skipped", term[0])

tags
# What remains is the minimal set of meaning-bearing words.
Skipped the
little
yellow
dog
barked
Skipped at
Skipped the
persian
cat
Out[27]:
['little', 'yellow', 'dog', 'barked', 'persian', 'cat']
In [40]:
from nltk.tag import untag    # helper that strips the tag from (word, tag) pairs

# Tag the tokens, then recover the plain words in two equivalent ways.
tagged = pos_tag(result)
untag(tagged)

# Hand-written equivalent: keep only the first element of each pair.
[tag[0] for tag in tagged]
Out[40]:
['the', 'little', 'yellow', 'dog', 'barked', 'at', 'the', 'persian', 'cat']
In [44]:
# ["/".join(tag) for tag in tagged]

from nltk import Text
tagged = pos_tag(result)

textObj = Text(tagged)  # the items of this Text are (word, tag) tuples
textObj.vocab().N()  # how many terms were counted
textObj.vocab().freq(('the','DT'))    # relative frequency of this pair
textObj.vocab().keys()  # only the distinct (word, tag) keys
Out[44]:
dict_keys([('the', 'DT'), ('little', 'JJ'), ('yellow', 'JJ'), ('dog', 'NN'), ('barked', 'VBD'), ('at', 'IN'), ('persian', 'JJ'), ('cat', 'NN')])
In [45]:
# Report every (word, tag) key whose relative frequency in the text exceeds 0.2.
for key in textObj.vocab().keys():
    if textObj.vocab().freq(key) > 0.2:
        print("Skipped", key)     # frequency strictly above 0.2 -> reported as skipped
Skipped ('the', 'DT')
In [71]:
from nltk.probability import FreqDist
from collections import defaultdict

# FreqDist is used like defaultdict(int): a missing key starts at zero.
taggedTerms = FreqDist()

# Pass 1: every distinct (word, tag) key whose tag starts with "N" counts once.
for key in textObj.vocab():
    if not key[1].startswith("N"):
        continue
    taggedTerms[key] += 1

# Keys are (word, tag) tuples; "/".join(key) would give the "word/tag" form.

# Pass 2: keys whose tag starts with "DT" contribute their actual count.
# Alternatives for the test: key[1] in ["DT"], or a regex such as r"DT.*".
for key in textObj.vocab():
    if not key[1].startswith("DT"):
        continue
    taggedTerms[key] += textObj.vocab()[key]

taggedTerms, taggedTerms.N()
Out[71]:
(FreqDist({('the', 'DT'): 2, ('dog', 'NN'): 1, ('cat', 'NN'): 1}), 4)
In [80]:
from nltk.corpus import gutenberg

# Load the first Gutenberg file as one string. raw() opens and closes the
# underlying stream itself, whereas open(...).read() left it unclosed.
corpus = gutenberg.raw(gutenberg.fileids()[0])
corpus = pattern.sub("", corpus)  # remove every punctuation character
tokens = word_tokenize(corpus)
tagged = pos_tag(tokens)  # list of (word, tag) pairs

# Split the pairs into parallel lists of words and of tags.
wordList = [word for word, _ in tagged]
posList = [pos for _, pos in tagged]
In [81]:
# Sanity check: all four sequences have the same length.
len(tokens), len(tagged), len(wordList), len(posList)
Out[81]:
(158270, 158270, 158270, 158270)
In [106]:
# Frequency tables whose rank/frequency curves are compared in the plot below.
freqTokens = FreqDist(tokens)    # raw tokens
freqTagged = FreqDist(tagged)    # (word, tag) tuples
freqWord = FreqDist(wordList)    # words only
freqPos = FreqDist(posList)      # tags only
In [92]:
# Ten most frequent entries of the pair, word and tag tables.
freqTagged.most_common(10), freqWord.most_common(10),freqPos.most_common(10)
Out[92]:
([(('to', 'TO'), 5099),
  (('the', 'DT'), 4822),
  (('and', 'CC'), 4412),
  (('of', 'IN'), 4264),
  (('I', 'PRP'), 2968),
  (('a', 'DT'), 2962),
  (('was', 'VBD'), 2369),
  (('not', 'RB'), 2227),
  (('in', 'IN'), 2086),
  (('it', 'PRP'), 2041)],
 [('to', 5099),
  ('the', 4822),
  ('and', 4412),
  ('of', 4264),
  ('I', 2968),
  ('a', 2962),
  ('was', 2369),
  ('her', 2333),
  ('not', 2227),
  ('in', 2086)],
 [('NN', 18524),
  ('IN', 17875),
  ('PRP', 14866),
  ('RB', 12479),
  ('DT', 12276),
  ('JJ', 10305),
  ('NNP', 9863),
  ('VB', 9022),
  ('VBD', 8948),
  ('CC', 6445)])
In [108]:
# Zipf's law: a term's frequency is roughly inversely proportional to its rank.
import os

from matplotlib import font_manager, rc

# Korean-capable font for the axis labels. This path is macOS-specific, so the
# font is registered only when the file is actually present.
path = '/Library/Fonts/AppleGothic.ttf'
if os.path.exists(path):
    family = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=family)

import matplotlib.pyplot as plt

# Counts of the 50 most frequent entries of each table.
y = [pair[1] for pair in freqTagged.most_common(50)]
yy = [pair[1] for pair in freqWord.most_common(50)]
yyy = [pair[1] for pair in freqTokens.most_common(50)]
# Tags have far fewer distinct values, so they are kept separate.
yyyy = [pair[1] for pair in freqPos.most_common(10)]

# Rank axes are derived from the data so a table with fewer than 50 (or 10)
# entries no longer causes a length mismatch in plt.plot.
x = range(1, len(y) + 1)
xx = range(1, len(yyyy) + 1)

plt.plot(x, y, 'r-')
plt.plot(range(1, len(yy) + 1), yy, 'b-')
plt.plot(range(1, len(yyy) + 1), yyy, 'g-')
# The tag curve has a different length, so it would be drawn separately:
# plt.plot(xx, yyyy, 'g-')
plt.xlabel("단어 ( 단어, 품사 쌍)의 순위")
plt.ylabel("빈도")
plt.show()
In [109]:
# Heaps' law: describes how many unique terms (vocabulary size) occur among all the tokens of a text.
# TODO: come back to this later.
In [130]:
# Korean data - Hannanum

from konlpy.tag import Hannanum

sen = "내 우리 그 이 나는 친구는 잠을 많이 잔다."

obj = Hannanum()
obj.morphs(sen) # split into morphemes
obj.pos(sen) # morphemes plus their part-of-speech tags
# obj.tagset # inspect the tagset
obj.nouns(sen) # nouns only (the only value the cell displays)
Out[130]:
['내', '우리', '나', '친구', '잠']
In [131]:
# Korean data - Kkma

from konlpy.tag import Kkma

sen = "내 우리 그 이 나는 친구는 잠을 많이 잔다."

obj = Kkma()
obj.morphs(sen) # split into morphemes
obj.pos(sen) # morphemes plus tags; author's note: Kkma tags "많이" as an adverb (that output is not shown here)
# obj.tagset # inspect the tagset
obj.nouns(sen) # nouns only (the only value the cell displays)
Out[131]:
['내', '우리', '이', '친구', '잠']
In [132]:
# Korean data - Komoran

from konlpy.tag import Komoran

sen = "내 우리 그 이 나는 친구는 잠을 많이 잔다."

obj = Komoran()
obj.morphs(sen) # split into morphemes
obj.pos(sen) # morphemes plus tags; author's note: "많이" is tagged as an adverb here too (output not shown)
# obj.tagset # inspect the tagset
obj.nouns(sen) # nouns only; "내" and the other pronouns are absent from the result
Out[132]:
['친구', '잠']
In [141]:
# Korean data - Okt

from konlpy.tag import Okt

sen = "내 우리 그 이 나는 친구는 잠을 많이 잔다."

obj = Okt()
obj.morphs(sen) # split into morphemes
obj.pos(sen) # morphemes plus tags; author's note: "많이" is tagged as an adverb here too (output not shown)
# obj.tagset # inspect the tagset
obj.nouns(sen) # nouns only; the pronouns are returned as well
Out[141]:
['내', '우리', '그', '이', '나', '친구', '잠']

형태소 분리 성능

Kkma > Komoran = Okt > Hannanum

품사 분석 성능

Kkma > Komoran = Okt > Hannanum

명사 분석 성능

Kkma = Hannanum = Okt → 대명사까지 포함한 모든 명사를 반환 / Komoran → 대명사를 제외하고 반환

  • Mecab의 성능이 가장 좋다고 알려져 있지만, KoNLPy의 Mecab은 Windows에서 지원되지 않는다.
  • Okt(Open Korean Text)는 기존 Twitter 형태소 분석기가 업그레이드되면서 이름이 바뀐 것이다.

국립국어원 - 세종21

https://ithub.korean.go.kr/user/corpus/corpusSearchManager.do

국립국어원 - 말뭉치 - 찾기 - 형태분석

In [143]:
# Sample sentence used for the morphological-analysis comparison.
sentence="구해지지 않는 직장을 구하느니 고향친구 경자가 시집가느라 내놓은 인조꽃가게를 돈벌면 갚기로 하고 우선 넘겨받았다."

# Okt variant, left disabled:
# obj = Okt()
# obj.morphs(sentence) # split into morphemes
# obj.pos(sentence) # morphemes plus tags
# # obj.tagset # inspect the tagset
# # obj.nouns(sentence) # nouns only

obj = Kkma()
obj.morphs(sentence) # split into morphemes (value not displayed)
obj.pos(sentence) # (morpheme, tag) pairs; this is the value the cell displays
# obj.tagset # inspect the tagset
# obj.nouns(sentence) # nouns only
Out[143]:
[('구하', 'VV'),
 ('어', 'ECS'),
 ('지', 'VXV'),
 ('지', 'ECD'),
 ('않', 'VXV'),
 ('는', 'ETD'),
 ('직장', 'NNG'),
 ('을', 'JKO'),
 ('구하', 'VV'),
 ('느니', 'ECE'),
 ('고향', 'NNG'),
 ('친구', 'NNG'),
 ('경자', 'NNG'),
 ('가', 'JKS'),
 ('시집가', 'VV'),
 ('느라', 'ECD'),
 ('내놓', 'VV'),
 ('은', 'ETD'),
 ('인조', 'NNG'),
 ('꽃', 'NNG'),
 ('가게', 'NNG'),
 ('를', 'JKO'),
 ('돈', 'NNG'),
 ('벌', 'VV'),
 ('면', 'ECE'),
 ('갚', 'VV'),
 ('기로', 'ECD'),
 ('하', 'VV'),
 ('고', 'ECE'),
 ('우선', 'MAG'),
 ('넘겨받', 'VV'),
 ('았', 'EPT'),
 ('다', 'EFN'),
 ('.', 'SF')]

한글에 대해 다시 그래프 그리기

In [145]:
from konlpy.corpus import kolaw

# Read the first document of the kolaw corpus. The with-block closes the file
# handle, which the bare open(...).read() call left open.
with kolaw.open(kolaw.fileids()[0]) as f:
    corpus = f.read()

# Morphological analysis: list of (morpheme, tag) pairs.
tokens = Kkma().pos(corpus)
In [148]:
# Morphological analysis produces far more items than plain tokenization.
len(word_tokenize(corpus)), len(tokens), tokens[0]
Out[148]:
(4640, 10053, ('대한민국', 'NNG'))
In [151]:
# Frequency tables for the Korean corpus.
rawList = FreqDist(word_tokenize(corpus))             # plain tokens
pairList = FreqDist(tokens)                           # (morpheme, tag) pairs
wordList = FreqDist(morph for morph, _ in tokens)     # morphemes only
posList = FreqDist(pos for _, pos in tokens)          # tags only
    
In [152]:
# Number of distinct keys in each frequency table.
len(rawList), len(pairList), len(wordList), len(posList)
Out[152]:
(2023, 1294, 1247, 38)
In [159]:
# Zipf's law: a term's frequency is roughly inversely proportional to its rank.
import matplotlib.pyplot as plt
import math

# Counts of the 50 most frequent entries of each table.
y = [pair[1] for pair in rawList.most_common(50)]
yy = [pair[1] for pair in pairList.most_common(50)]
yyy = [pair[1] for pair in wordList.most_common(50)]
# Part-of-speech tags (kept separate: far fewer distinct values).
yyyy = [pair[1] for pair in posList.most_common(10)]

# Rank axes are derived from the data so a table with fewer than 50 (or 10)
# entries no longer causes a length mismatch in plt.plot.
x = range(1, len(y) + 1)
xx = range(1, len(yyyy) + 1)

# log10 of every count (counts are at least 1, so the log is defined).
_y = [math.log10(_) for _ in y]
_yy = [math.log10(_) for _ in yy]
_yyy = [math.log10(_) for _ in yyy]

# Plot 1: raw counts against rank.
plt.plot(x, y, 'r-')
plt.plot(range(1, len(yy) + 1), yy, 'b-')
plt.plot(range(1, len(yyy) + 1), yyy, 'g-')
# The tag curve has a different length, so it would be drawn separately:
# plt.plot(xx, yyyy, 'g-')
plt.xlabel("단어 ( 단어, 품사 쌍)의 순위")
plt.ylabel("빈도")
plt.show()

# Splitting the text into morphemes makes the curves differ considerably.

# Plot 2: log10 counts against rank. The y label now says so; it previously
# read as a raw frequency.
plt.plot(x, _y, 'r-')
plt.plot(range(1, len(_yy) + 1), _yy, 'b-')
plt.plot(range(1, len(_yyy) + 1), _yyy, 'g-')
plt.xlabel("단어 ( 단어, 품사 쌍)의 순위")
plt.ylabel("log10(빈도)")
plt.show()

# Author's note: the analysed text follows the more general rule; aim for a
# curve of this shape instead of using the raw text directly.

수집한 뉴스데이터에 적용

In [174]:
# Zipf's law: a term's frequency is roughly inversely proportional to its rank.
import matplotlib.pyplot as plt
import math
import os

# Build one corpus from every .txt file under ./News. Files are read in sorted
# order (os.listdir order is arbitrary) and joined with a newline so the last
# word of one article cannot fuse with the first word of the next.
documents = []
for file in sorted(os.listdir("./News")):
    if file.endswith(".txt"):
        with open(os.path.join("./News", file), encoding='utf-8') as f:
            documents.append(f.read())
corpus = "\n".join(documents)

# Frequency tables: plain tokens, (morpheme, tag) pairs, morphemes, tags.
rawList = FreqDist(word_tokenize(corpus))

tokens = Kkma().pos(corpus)

pairList = FreqDist(tokens)
wordList = FreqDist(pair[0] for pair in tokens)
posList = FreqDist(pair[1] for pair in tokens)

# Counts of the 50 most frequent entries of each table.
y = [pair[1] for pair in rawList.most_common(50)]
yy = [pair[1] for pair in pairList.most_common(50)]
yyy = [pair[1] for pair in wordList.most_common(50)]
# Part-of-speech tags (kept separate: far fewer distinct values).
yyyy = [pair[1] for pair in posList.most_common(10)]

# Rank axes are derived from the data so a small corpus with fewer than 50
# (or 10) distinct entries no longer causes a length mismatch in plt.plot.
x = range(1, len(y) + 1)
xx = range(1, len(yyyy) + 1)

# log10 of every count (counts are at least 1, so the log is defined).
_y = [math.log10(_) for _ in y]
_yy = [math.log10(_) for _ in yy]
_yyy = [math.log10(_) for _ in yyy]

# Plot 1: raw counts against rank.
plt.plot(x, y, 'r-')
plt.plot(range(1, len(yy) + 1), yy, 'b-')
plt.plot(range(1, len(yyy) + 1), yyy, 'g-')
# The tag curve has a different length, so it would be drawn separately:
# plt.plot(xx, yyyy, 'g-')
plt.xlabel("단어 ( 단어, 품사 쌍)의 순위")
plt.ylabel("빈도")
plt.show()

# Splitting the text into morphemes makes the curves differ considerably.

# Plot 2: log10 counts against rank. The y label now says so; it previously
# read as a raw frequency.
plt.plot(x, _y, 'r-')
plt.plot(range(1, len(_yy) + 1), _yy, 'b-')
plt.plot(range(1, len(_yyy) + 1), _yyy, 'g-')
plt.xlabel("단어 ( 단어, 품사 쌍)의 순위")
plt.ylabel("log10(빈도)")
plt.show()

# Author's note: after taking the log, the inverse relationship shows up
# very clearly.

+ n-gram 적용

In [164]:
def ngramEojeol(sentence, n=2):
    """Return the word-level (eojeol) n-grams of a sentence.

    The sentence is split on whitespace and every run of ``n`` consecutive
    words is joined back together with a single space. A sentence of k words
    yields k - n + 1 n-grams, e.g. "w1 w2 w3 w4" gives
    ["w1 w2", "w2 w3", "w3 w4"] for n=2 and ["w1 w2 w3", "w2 w3 w4"] for n=3.

    Args:
        sentence: Text to split on whitespace.
        n: Number of words per n-gram (default 2).

    Returns:
        A list of n-gram strings. It is empty when ``n`` is smaller than 1
        or larger than the number of words in the sentence.
    """
    if n < 1:
        # A non-positive width previously produced empty or truncated joins
        # instead of real n-grams.
        return []

    tokens = sentence.split()
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
In [165]:
def ngramUmjeol(term, n=2):
    """Return the character-level (syllable) n-grams of a term.

    Every run of ``n`` consecutive items of ``term`` is concatenated into one
    string. A term of k characters yields k - n + 1 n-grams, e.g. "abcd"
    gives ["ab", "bc", "cd"] for n=2 and ["abc", "bcd"] for n=3.

    Args:
        term: A string, or a sequence of strings, to slide the window over.
        n: Number of characters per n-gram (default 2).

    Returns:
        A list of n-gram strings. It is empty when ``n`` is smaller than 1
        or larger than the length of ``term``.
    """
    if n < 1:
        # A non-positive width previously produced empty or truncated joins
        # instead of real n-grams.
        return []

    # Join the slice so that a sequence of syllables also yields strings.
    return ["".join(term[i:i + n]) for i in range(len(term) - n + 1)]
In [180]:
# Frequency of every syllable bigram found inside each token of the corpus.
ngramList = FreqDist(
    token
    for term in word_tokenize(corpus)
    for token in ngramUmjeol(term)
)

# Counts of the 50 most frequent entries of each table.
y = [pair[1] for pair in rawList.most_common(50)]
yy = [pair[1] for pair in pairList.most_common(50)]
yyy = [pair[1] for pair in wordList.most_common(50)]
# Part-of-speech tags (kept separate: far fewer distinct values).
yyyy = [pair[1] for pair in posList.most_common(10)]

# Rank axes are derived from the data so a table with fewer than 50 (or 10)
# entries no longer causes a length mismatch in plt.plot.
x = range(1, len(y) + 1)
xx = range(1, len(yyyy) + 1)

# log10 of every count (counts are at least 1, so the log is defined).
_y = [math.log10(_) for _ in y]
_yy = [math.log10(_) for _ in yy]
_yyy = [math.log10(_) for _ in yyy]

ngramY = [pair[1] for pair in ngramList.most_common(50)]
ngramYY = [math.log10(_) for _ in ngramY]

# Plot 1: raw counts against rank; the yellow curve is the n-gram table.
plt.plot(x, y, 'r-')
plt.plot(range(1, len(yy) + 1), yy, 'b-')
plt.plot(range(1, len(yyy) + 1), yyy, 'g-')
plt.plot(range(1, len(ngramY) + 1), ngramY, 'y-')

# The tag curve has a different length, so it would be drawn separately:
# plt.plot(xx, yyyy, 'g-')
plt.xlabel("단어 ( 단어, 품사 쌍)의 순위")
plt.ylabel("빈도")
plt.show()

# Plot 2: log10 counts against rank. The y label now says so; it previously
# read as a raw frequency.
plt.plot(x, _y, 'r-')
plt.plot(range(1, len(_yy) + 1), _yy, 'b-')
plt.plot(range(1, len(_yyy) + 1), _yyy, 'g-')
plt.plot(range(1, len(ngramYY) + 1), ngramYY, 'y-')
plt.xlabel("단어 ( 단어, 품사 쌍)의 순위")
plt.ylabel("log10(빈도)")
plt.show()

Collocations

Collocation(연어)은 함께 자주 쓰이는 단어들의 특정한 열(sequence)이다.

동시 출현(co-occurrence) 빈도가 가장 높은 단어 쌍을 찾는다. 한 문맥이나 문장 안에서 서로 떨어진 위치에 있더라도 함께 자주 나타나는 단어들이 여기에 해당한다.

단어, 품사 쌍

In [211]:
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk import pos_tag
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize

# Load the first Gutenberg text. raw() opens and closes the underlying stream
# itself, whereas open(...).read() left it unclosed.
corpus = gutenberg.raw(gutenberg.fileids()[0])
tokens = word_tokenize(corpus)
result = pos_tag(tokens)

# Keep the (word, tag) pairs whose word is longer than two characters.
# NOTE(review): filtering before the finder is built makes words that were
# separated by a dropped short token look adjacent. The finder's
# apply_word_filter() would avoid that but changes the rankings shown in the
# recorded output — confirm which behaviour is wanted.
tokens = [row for row in result if len(row[0]) > 2]

# Collect bigram statistics over the filtered sequence of pairs.
bigram = BigramCollocationFinder.from_words(tokens)

# Best 10 pairs by PMI (computed but not displayed: only the last expression
# of the cell is shown).
bigram.nbest(BigramAssocMeasures().pmi, 10)
# Best 10 pairs by chi-square.
# Author's note: every item is presumably treated as a distinct entry.
bigram.nbest(BigramAssocMeasures().chi_sq, 10)
Out[211]:
[(('26th', 'CD'), ('ult.', 'NN')),
 (('Abominable', 'JJ'), ('scoundrel', 'NN')),
 (('Agricultural', 'NNP'), ('Reports', 'NNP')),
 (('Always', 'NNP'), ('deceived', 'VBD')),
 (('Austen', 'NNP'), ('1816', 'CD')),
 (('Baronne', 'NNP'), ("d'Almane", 'NN')),
 (('Books', 'NNP'), ('engravings', 'NNS')),
 (('Candles', 'NNP'), ('everywhere.', 'VBP')),
 (('Clayton', 'NNP'), ('Park', 'NNP')),
 (('Comtesse', 'NNP'), ("d'Ostalis", 'NN'))]

품사 쌍

In [208]:
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk import pos_tag
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize

# Load the first Gutenberg text. raw() opens and closes the underlying stream
# itself, whereas open(...).read() left it unclosed.
corpus = gutenberg.raw(gutenberg.fileids()[0])
tokens = word_tokenize(corpus)
result = pos_tag(tokens)

# Keep only the tag of every token whose word is longer than two characters.
# NOTE(review): filtering before the finder is built makes tags that were
# separated by a dropped short token look adjacent. The finder's
# apply_word_filter() would avoid that but changes the rankings shown in the
# recorded output — confirm which behaviour is wanted.
tokens = [row[1] for row in result if len(row[0]) > 2]

# Collect bigram statistics over the filtered tag sequence.
bigram = BigramCollocationFinder.from_words(tokens)

# Best 10 tag pairs by PMI (computed but not displayed: only the last
# expression of the cell is shown).
bigram.nbest(BigramAssocMeasures().pmi, 10)
# Best 10 tag pairs by chi-square.
# Author's note: every item is presumably treated as a distinct entry.
bigram.nbest(BigramAssocMeasures().chi_sq, 10)
Out[208]:
[('NNP', 'NNP'),
 ('DT', 'NN'),
 ('MD', 'VB'),
 ('JJ', 'NN'),
 ('PRP', 'VBD'),
 ('PRP$', 'NN'),
 ('MD', 'RB'),
 ('PRP', 'VBP'),
 ('VBD', 'VBN'),
 ('PRP', 'MD')]
In [212]:
# Raw bigram counts from the finder; the author notes they resemble the
# pairs found above.
bigram.ngram_fd.most_common()
Out[212]:
[((('had', 'VBD'), ('been', 'VBN')), 308),
 ((('could', 'MD'), ('not', 'RB')), 277),
 ((('Mr.', 'NNP'), ('Knightley', 'NNP')), 274),
 ((('and', 'CC'), ('the', 'DT')), 257),
 ((('she', 'PRP'), ('had', 'VBD')), 254),
 ((('Mrs.', 'NNP'), ('Weston', 'NNP')), 246),
 ((('she', 'PRP'), ('was', 'VBD')), 218),
 ((('Mr.', 'NNP'), ('Elton', 'NNP')), 211),
 ((('that', 'IN'), ('she', 'PRP')), 186),
 ((('did', 'VBD'), ('not', 'RB')), 179),
 ((('for', 'IN'), ('the', 'DT')), 172),
 ((('Miss', 'NNP'), ('Woodhouse', 'NNP')), 167),
 ((('any', 'DT'), ('thing', 'NN')), 163),
 ((('all', 'PDT'), ('the', 'DT')), 161),
 ((('was', 'VBD'), ('not', 'RB')), 161),
 ((('Mr.', 'NNP'), ('Weston', 'NNP')), 158),
 ((('have', 'VB'), ('been', 'VBN')), 156),
 ((('would', 'MD'), ('not', 'RB')), 144),
 ((('Frank', 'NNP'), ('Churchill', 'NNP')), 144),
 ((('Mrs.', 'NNP'), ('Elton', 'NNP')), 140),
 ((('can', 'MD'), ('not', 'RB')), 139),
 ((('she', 'PRP'), ('could', 'MD')), 138),
 ((('would', 'MD'), ('have', 'VB')), 137),
 ((('Mr.', 'NNP'), ('Woodhouse', 'NNP')), 131),
 ((('and', 'CC'), ('she', 'PRP')), 129),
 ((('her', 'PRP$'), ('own', 'JJ')), 124),
 ((('with', 'IN'), ('the', 'DT')), 122),
 ((('Miss', 'NNP'), ('Fairfax', 'NNP')), 121),
 ((('for', 'IN'), ('her', 'PRP$')), 116),
 ((('every', 'DT'), ('thing', 'NN')), 116),
 ((('had', 'VBD'), ('not', 'RB')), 113),
 ((('was', 'VBD'), ('the', 'DT')), 111),
 ((('She', 'PRP'), ('was', 'VBD')), 110),
 ((('Miss', 'NNP'), ('Bates', 'NNP')), 107),
 ((('not', 'RB'), ('have', 'VB')), 106),
 ((('Jane', 'NNP'), ('Fairfax', 'NNP')), 104),
 ((('was', 'VBD'), ('very', 'RB')), 103),
 ((('you', 'PRP'), ('know', 'VBP')), 98),
 ((('the', 'DT'), ('same', 'JJ')), 98),
 ((('there', 'EX'), ('was', 'VBD')), 95),
 ((('from', 'IN'), ('the', 'DT')), 94),
 ((('every', 'DT'), ('body', 'NN')), 92),
 ((('and', 'CC'), ('Mrs.', 'NNP')), 92),
 ((('and', 'CC'), ('was', 'VBD')), 89),
 ((('the', 'DT'), ('very', 'RB')), 89),
 ((('you', 'PRP'), ('are', 'VBP')), 89),
 ((('have', 'VBP'), ('been', 'VBN')), 88),
 ((('young', 'JJ'), ('man', 'NN')), 83),
 ((('her', 'PRP$'), ('father', 'NN')), 81),
 ((('not', 'RB'), ('know', 'VB')), 81),
 ((('and', 'CC'), ('Mr.', 'NNP')), 79),
 ((('you', 'PRP'), ('have', 'VBP')), 79),
 ((('She', 'PRP'), ('had', 'VBD')), 78),
 ((('his', 'PRP$'), ('own', 'JJ')), 78),
 ((('will', 'MD'), ('not', 'RB')), 77),
 ((('any', 'DT'), ('body', 'NN')), 77),
 ((('the', 'DT'), ('world', 'NN')), 75),
 ((('they', 'PRP'), ('were', 'VBD')), 75),
 ((('but', 'CC'), ('she', 'PRP')), 73),
 ((('the', 'DT'), ('other', 'JJ')), 73),
 ((('and', 'CC'), ('her', 'PRP$')), 71),
 ((('and', 'CC'), ('Emma', 'NNP')), 71),
 ((('and', 'CC'), ('that', 'IN')), 71),
 ((('more', 'JJR'), ('than', 'IN')), 70),
 ((('she', 'PRP'), ('would', 'MD')), 70),
 ((('the', 'DT'), ('first', 'JJ')), 69),
 ((('and', 'CC'), ('Miss', 'NNP')), 68),
 ((('and', 'CC'), ('his', 'PRP$')), 68),
 ((('Emma', 'NNP'), ('was', 'VBD')), 67),
 ((('must', 'MD'), ('have', 'VB')), 67),
 ((('which', 'WDT'), ('she', 'PRP')), 67),
 ((('you', 'PRP'), ('will', 'MD')), 66),
 ((('that', 'IN'), ('you', 'PRP')), 65),
 ((('great', 'JJ'), ('deal', 'NN')), 64),
 ((('said', 'VBD'), ('Emma', 'NNP')), 64),
 ((('him', 'PRP'), ('and', 'CC')), 64),
 ((('does', 'VBZ'), ('not', 'RB')), 63),
 ((('and', 'CC'), ('very', 'RB')), 61),
 ((('the', 'DT'), ('subject', 'NN')), 61),
 ((('that', 'IN'), ('the', 'DT')), 61),
 ((('Emma', 'NNP'), ('could', 'MD')), 60),
 ((('with', 'IN'), ('her', 'PRP$')), 60),
 ((('should', 'MD'), ('not', 'RB')), 59),
 ((('her', 'PRP$'), ('and', 'CC')), 59),
 ((('not', 'RB'), ('think', 'VB')), 58),
 ((('has', 'VBZ'), ('been', 'VBN')), 58),
 ((('said', 'VBD'), ('she', 'PRP')), 58),
 ((('very', 'RB'), ('well', 'RB')), 57),
 ((('but', 'CC'), ('the', 'DT')), 56),
 ((('You', 'PRP'), ('are', 'VBP')), 56),
 ((('Miss', 'NNP'), ('Smith', 'NNP')), 56),
 ((('John', 'NNP'), ('Knightley', 'NNP')), 55),
 ((('and', 'CC'), ('had', 'VBD')), 54),
 ((('not', 'RB'), ('the', 'DT')), 54),
 ((('that', 'IN'), ('had', 'VBD')), 54),
 ((('should', 'MD'), ('have', 'VB')), 54),
 ((('and', 'CC'), ('then', 'RB')), 53),
 ((('the', 'DT'), ('most', 'RBS')), 53),
 ((('that', 'IN'), ('was', 'VBD')), 52),
 ((('might', 'MD'), ('have', 'VB')), 52),
 ((('Mrs.', 'NNP'), ('Goddard', 'NNP')), 52),
 ((('very', 'RB'), ('much', 'JJ')), 51),
 ((('all', 'DT'), ('her', 'PRP$')), 50),
 ((('have', 'VBP'), ('not', 'RB')), 49),
 ((('into', 'IN'), ('the', 'DT')), 49),
 ((('Mr.', 'NNP'), ('Frank', 'NNP')), 49),
 ((('who', 'WP'), ('had', 'VBD')), 48),
 ((('but', 'CC'), ('was', 'VBD')), 48),
 ((('and', 'CC'), ('with', 'IN')), 48),
 ((('what', 'WP'), ('she', 'PRP')), 47),
 ((('she', 'PRP'), ('did', 'VBD')), 47),
 ((('been', 'VBN'), ('the', 'DT')), 47),
 ((('could', 'MD'), ('have', 'VB')), 47),
 ((('Miss', 'NNP'), ('Taylor', 'NNP')), 46),
 ((('the', 'DT'), ('house', 'NN')), 46),
 ((('very', 'RB'), ('much', 'RB')), 46),
 ((('you', 'PRP'), ('would', 'MD')), 46),
 ((('and', 'CC'), ('when', 'WRB')), 46),
 ((('with', 'IN'), ('her', 'PRP')), 45),
 ((('very', 'RB'), ('good', 'JJ')), 44),
 ((('what', 'WP'), ('you', 'PRP')), 44),
 ((('assure', 'VBP'), ('you', 'PRP')), 44),
 ((('the', 'DT'), ('two', 'CD')), 43),
 ((('you', 'PRP'), ('must', 'MD')), 43),
 ((('with', 'IN'), ('him', 'PRP')), 43),
 ((('had', 'VBD'), ('never', 'RB')), 43),
 ((('There', 'EX'), ('was', 'VBD')), 43),
 ((('was', 'VBD'), ('quite', 'RB')), 43),
 ((('you', 'PRP'), ('were', 'VBD')), 43),
 ((('from', 'IN'), ('her', 'PRP$')), 42),
 ((('Emma', 'NNP'), ('had', 'VBD')), 42),
 ((('her', 'PRP$'), ('friend', 'NN')), 41),
 ((('Mr.', 'NNP'), ('and', 'CC')), 41),
 ((('said', 'VBD'), ('Mr.', 'NNP')), 41),
 ((('they', 'PRP'), ('are', 'VBP')), 41),
 ((('the', 'DT'), ('whole', 'JJ')), 41),
 ((('the', 'DT'), ('time', 'NN')), 41),
 ((('and', 'CC'), ('Harriet', 'NNP')), 41),
 ((('the', 'DT'), ('room', 'NN')), 41),
 ((('again', 'RB'), ('and', 'CC')), 41),
 ((('but', 'CC'), ('not', 'RB')), 40),
 ((('not', 'RB'), ('been', 'VBN')), 40),
 ((('You', 'PRP'), ('will', 'MD')), 40),
 ((('too', 'RB'), ('much', 'JJ')), 39),
 ((('say', 'VB'), ('that', 'IN')), 39),
 ((('than', 'IN'), ('she', 'PRP')), 39),
 ((('dare', 'VBP'), ('say', 'VB')), 39),
 ((('Mrs.', 'NNP'), ('Churchill', 'NNP')), 39),
 ((('his', 'PRP$'), ('father', 'NN')), 39),
 ((('the', 'DT'), ('young', 'JJ')), 39),
 ((('the', 'DT'), ('best', 'JJS')), 38),
 ((('the', 'DT'), ('rest', 'NN')), 38),
 ((('that', 'IN'), ('Mr.', 'NNP')), 38),
 ((('you', 'PRP'), ('and', 'CC')), 38),
 ((('and', 'CC'), ('every', 'DT')), 38),
 ((('for', 'IN'), ('his', 'PRP$')), 38),
 ((('must', 'MD'), ('not', 'RB')), 38),
 ((('not', 'RB'), ('you', 'PRP')), 38),
 ((('and', 'CC'), ('not', 'RB')), 38),
 ((('Weston', 'NNP'), ('was', 'VBD')), 37),
 ((('for', 'IN'), ('him', 'PRP')), 37),
 ((('Mr.', 'NNP'), ('Martin', 'NNP')), 37),
 ((('when', 'WRB'), ('she', 'PRP')), 36),
 ((('she', 'PRP'), ('must', 'MD')), 36),
 ((('for', 'IN'), ('any', 'DT')), 36),
 ((('that', 'IN'), ('should', 'MD')), 36),
 ((('was', 'VBD'), ('all', 'DT')), 36),
 ((('Mr.', 'NNP'), ('Perry', 'NNP')), 36),
 ((('they', 'PRP'), ('had', 'VBD')), 35),
 ((('the', 'DT'), ('evening', 'NN')), 35),
 ((('are', 'VBP'), ('not', 'RB')), 35),
 ((('the', 'DT'), ('idea', 'NN')), 35),
 ((('that', 'IN'), ('they', 'PRP')), 34),
 ((('which', 'WDT'), ('had', 'VBD')), 34),
 ((('them', 'PRP'), ('and', 'CC')), 34),
 ((('with', 'IN'), ('you', 'PRP')), 34),
 ((('some', 'DT'), ('time', 'NN')), 34),
 ((('about', 'IN'), ('the', 'DT')), 34),
 ((('what', 'WP'), ('was', 'VBD')), 34),
 ((('Harriet', 'NNP'), ('was', 'VBD')), 34),
 ((('which', 'WDT'), ('was', 'VBD')), 34),
 ((('and', 'CC'), ('though', 'IN')), 33),
 ((('with', 'IN'), ('them', 'PRP')), 33),
 ((('she', 'PRP'), ('might', 'MD')), 33),
 ((('for', 'IN'), ('you', 'PRP')), 33),
 ((('Knightley', 'NNP'), ('and', 'CC')), 33),
 ((('are', 'VBP'), ('very', 'RB')), 33),
 ((('She', 'PRP'), ('could', 'MD')), 33),
 ((('Weston', 'NNP'), ('and', 'CC')), 32),
 ((('had', 'VBD'), ('the', 'DT')), 32),
 ((('was', 'VBD'), ('now', 'RB')), 31),
 ((('with', 'IN'), ('Mr.', 'NNP')), 31),
 ((('love', 'NN'), ('with', 'IN')), 31),
 ((('but', 'CC'), ('you', 'PRP')), 31),
 ((('that', 'IN'), ('there', 'EX')), 31),
 ((('Robert', 'NNP'), ('Martin', 'NNP')), 31),
 ((('Elton', 'NNP'), ('was', 'VBD')), 31),
 ((('she', 'PRP'), ('has', 'VBZ')), 31),
 ((('you', 'PRP'), ('had', 'VBD')), 31),
 ((('Maple', 'NNP'), ('Grove', 'NNP')), 31),
 ((('not', 'RB'), ('all', 'DT')), 30),
 ((('she', 'PRP'), ('will', 'MD')), 30),
 ((('But', 'CC'), ('you', 'PRP')), 30),
 ((('one', 'CD'), ('the', 'DT')), 30),
 ((('his', 'PRP$'), ('wife', 'NN')), 30),
 ((('the', 'DT'), ('day', 'NN')), 30),
 ((('Mrs.', 'NNP'), ('Cole', 'NNP')), 30),
 ((('the', 'DT'), ('next', 'JJ')), 29),
 ((('might', 'MD'), ('not', 'RB')), 29),
 ((('for', 'IN'), ('Mr.', 'NNP')), 29),
 ((('and', 'CC'), ('must', 'MD')), 29),
 ((('Mrs.', 'NNP'), ('Bates', 'NNP')), 29),
 ((('with', 'IN'), ('all', 'PDT')), 29),
 ((('with', 'IN'), ('such', 'JJ')), 29),
 ((('Mr.', 'NNP'), ('John', 'NNP')), 29),
 ((('the', 'DT'), ('party', 'NN')), 29),
 ((('dear', 'JJ'), ('Emma', 'NNP')), 29),
 ((('the', 'DT'), ('carriage', 'NN')), 28),
 ((('were', 'VBD'), ('not', 'RB')), 28),
 ((('and', 'CC'), ('could', 'MD')), 28),
 ((('Harriet', 'NNP'), ('Smith', 'NNP')), 28),
 ((('They', 'PRP'), ('were', 'VBD')), 28),
 ((('with', 'IN'), ('his', 'PRP$')), 28),
 ((('the', 'DT'), ('least', 'JJS')), 28),
 ((('very', 'RB'), ('little', 'JJ')), 27),
 ((('and', 'CC'), ('they', 'PRP')), 27),
 ((('you', 'PRP'), ('may', 'MD')), 27),
 ((('the', 'DT'), ('door', 'NN')), 27),
 ((('the', 'DT'), ('only', 'JJ')), 27),
 ((('any', 'DT'), ('other', 'JJ')), 27),
 ((('its', 'PRP$'), ('being', 'VBG')), 27),
 ((('know', 'VB'), ('what', 'WP')), 27),
 ((('has', 'VBZ'), ('not', 'RB')), 27),
 ((('them', 'PRP'), ('the', 'DT')), 27),
 ((('was', 'VBD'), ('too', 'RB')), 27),
 ((('she', 'PRP'), ('should', 'MD')), 27),
 ((('cried', 'VBD'), ('Emma', 'NNP')), 27),
 ((('Colonel', 'NNP'), ('Campbell', 'NNP')), 27),
 ((('were', 'VBD'), ('the', 'DT')), 26),
 ((('and', 'CC'), ('there', 'EX')), 26),
 ((('that', 'IN'), ('her', 'PRP$')), 26),
 ((('not', 'RB'), ('but', 'CC')), 26),
 ((('Emma', 'NNP'), ('and', 'CC')), 26),
 ((('the', 'DT'), ('greatest', 'JJS')), 26),
 ((('her', 'PRP'), ('and', 'CC')), 26),
 ((('and', 'CC'), ('for', 'IN')), 26),
 ((('that', 'IN'), ('could', 'MD')), 26),
 ((('you', 'PRP'), ('should', 'MD')), 26),
 ((('she', 'PRP'), ('thought', 'VBD')), 26),
 ((('this', 'DT'), ('morning', 'NN')), 26),
 ((('not', 'RB'), ('very', 'RB')), 26),
 ((('father', 'NN'), ('and', 'CC')), 25),
 ((('and', 'CC'), ('you', 'PRP')), 25),
 ((('and', 'CC'), ('will', 'MD')), 25),
 ((('and', 'CC'), ('therefore', 'RB')), 25),
 ((('you', 'PRP'), ('can', 'MD')), 25),
 ((('was', 'VBD'), ('most', 'RBS')), 25),
 ((('see', 'VB'), ('him', 'PRP')), 25),
 ((('She', 'PRP'), ('would', 'MD')), 25),
 ((('This', 'DT'), ('was', 'VBD')), 25),
 ((('shall', 'MD'), ('not', 'RB')), 25),
 ((('her', 'PRP$'), ('mind', 'NN')), 25),
 ((('them', 'PRP'), ('all', 'DT')), 25),
 ((('Fairfax', 'NNP'), ('and', 'CC')), 25),
 ((('was', 'VBD'), ('always', 'RB')), 24),
 ((('had', 'VBD'), ('done', 'VBN')), 24),
 ((('tell', 'VB'), ('you', 'PRP')), 24),
 ((('the', 'DT'), ('pleasure', 'NN')), 24),
 ((('him', 'PRP'), ('the', 'DT')), 24),
 ((('Weston', 'NNP'), ('had', 'VBD')), 24),
 ((('have', 'VBP'), ('heard', 'VBN')), 24),
 ((('his', 'PRP$'), ('being', 'VBG')), 24),
 ((('those', 'DT'), ('who', 'WP')), 24),
 ((('the', 'DT'), ('present', 'JJ')), 24),
 ((('was', 'VBD'), ('obliged', 'VBN')), 24),
 ((('young', 'JJ'), ('woman', 'NN')), 24),
 ((('nothing', 'NN'), ('but', 'CC')), 24),
 ((('her', 'PRP$'), ('the', 'DT')), 24),
 ((('the', 'DT'), ('Crown', 'NNP')), 24),
 ((('the', 'DT'), ('last', 'JJ')), 23),
 ((('each', 'DT'), ('other', 'JJ')), 23),
 ((('the', 'DT'), ('place', 'NN')), 23),
 ((('young', 'JJ'), ('lady', 'NN')), 23),
 ((('know', 'VB'), ('that', 'IN')), 23),
 ((('Elton', 'NNP'), ('and', 'CC')), 23),
 ((('the', 'DT'), ('children', 'NNS')), 23),
 ((('and', 'CC'), ('yet', 'RB')), 23),
 ((('and', 'CC'), ('all', 'PDT')), 23),
 ((('body', 'NN'), ('else', 'RB')), 23),
 ((('and', 'CC'), ('only', 'RB')), 23),
 ((('and', 'CC'), ('Jane', 'NNP')), 23),
 ((('and', 'CC'), ('now', 'RB')), 22),
 ((('have', 'VB'), ('the', 'DT')), 22),
 ((('when', 'WRB'), ('the', 'DT')), 22),
 ((('but', 'CC'), ('had', 'VBD')), 22),
 ((('when', 'WRB'), ('they', 'PRP')), 22),
 ((('she', 'PRP'), ('found', 'VBD')), 22),
 ((('for', 'IN'), ('Harriet', 'NNP')), 22),
 ((('Harriet', 'NNP'), ('had', 'VBD')), 22),
 ((('had', 'VBD'), ('ever', 'RB')), 22),
 ((('your', 'PRP$'), ('own', 'JJ')), 22),
 ((('you', 'PRP'), ('not', 'RB')), 22),
 ((('not', 'RB'), ('quite', 'RB')), 22),
 ((('the', 'DT'), ('others', 'NNS')), 22),
 ((('She', 'PRP'), ('must', 'MD')), 22),
 ((('enough', 'RB'), ('for', 'IN')), 22),
 ((('Mr.', 'NNP'), ('Cole', 'NNP')), 22),
 ((('the', 'DT'), ('Campbells', 'NNP')), 22),
 ((('and', 'CC'), ('how', 'WRB')), 21),
 ((('her', 'PRP$'), ('husband', 'NN')), 21),
 ((('her', 'PRP$'), ('that', 'IN')), 21),
 ((('she', 'PRP'), ('knew', 'VBD')), 21),
 ((('that', 'IN'), ('Emma', 'NNP')), 21),
 ((('was', 'VBD'), ('his', 'PRP$')), 21),
 ((('the', 'DT'), ('way', 'NN')), 21),
 ((('him', 'PRP'), ('for', 'IN')), 21),
 ((('not', 'RB'), ('want', 'VB')), 21),
 ((('good', 'JJ'), ('deal', 'NN')), 21),
 ((('acquainted', 'VBN'), ('with', 'IN')), 21),
 ((('been', 'VBN'), ('very', 'RB')), 21),
 ((('and', 'CC'), ('after', 'IN')), 21),
 ((('You', 'PRP'), ('must', 'MD')), 21),
 ((('and', 'CC'), ('said', 'VBD')), 21),
 ((('being', 'VBG'), ('the', 'DT')), 21),
 ((('but', 'CC'), ('there', 'EX')), 21),
 ((('thing', 'NN'), ('the', 'DT')), 21),
 ((('upon', 'IN'), ('the', 'DT')), 21),
 ((('and', 'CC'), ('all', 'DT')), 21),
 ((('from', 'IN'), ('any', 'DT')), 21),
 ((('Mr.', 'NNP'), ('Dixon', 'NNP')), 21),
 ((('through', 'IN'), ('the', 'DT')), 20),
 ((('with', 'IN'), ('all', 'DT')), 20),
 ((('not', 'RB'), ('say', 'VB')), 20),
 ((('have', 'VB'), ('done', 'VBN')), 20),
 ((('than', 'IN'), ('any', 'DT')), 20),
 ((('the', 'DT'), ('letter', 'NN')), 20),
 ((('know', 'VB'), ('how', 'WRB')), 20),
 ((('Woodhouse', 'NNP'), ('was', 'VBD')), 20),
 ((('Hartfield', 'NNP'), ('and', 'CC')), 20),
 ((('with', 'IN'), ('any', 'DT')), 20),
 ((('Bates', 'NNP'), ('and', 'CC')), 20),
 ((('before', 'IN'), ('the', 'DT')), 20),
 ((('before', 'IN'), ('she', 'PRP')), 20),
 ((('well', 'RB'), ('and', 'CC')), 20),
 ((('but', 'CC'), ('for', 'IN')), 20),
 ((('were', 'VBD'), ('all', 'DT')), 20),
 ((('few', 'JJ'), ('minutes', 'NNS')), 20),
 ((('her', 'PRP'), ('the', 'DT')), 20),
 ((('not', 'RB'), ('help', 'VB')), 20),
 ((('the', 'DT'), ('smallest', 'JJS')), 20),
 ((('the', 'DT'), ('sort', 'NN')), 20),
 ((('this', 'DT'), ('moment', 'NN')), 20),
 ((('him', 'PRP'), ('that', 'IN')), 20),
 ((('very', 'RB'), ('great', 'JJ')), 20),
 ((('whom', 'WP'), ('she', 'PRP')), 19),
 ((('for', 'IN'), ('ever', 'RB')), 19),
 ((('his', 'PRP$'), ('daughter', 'NN')), 19),
 ((('may', 'MD'), ('not', 'RB')), 19),
 ((('you', 'PRP'), ('for', 'IN')), 19),
 ((('much', 'JJ'), ('the', 'DT')), 19),
 ((('are', 'VBP'), ('you', 'PRP')), 19),
 ((('him', 'PRP'), ('but', 'CC')), 19),
 ((('never', 'RB'), ('been', 'VBN')), 19),
 ((('him', 'PRP'), ('very', 'RB')), 19),
 ((('all', 'DT'), ('and', 'CC')), 19),
 ((('out', 'IN'), ('the', 'DT')), 19),
 ((('all', 'DT'), ('that', 'IN')), 19),
 ((('she', 'PRP'), ('saw', 'VBD')), 19),
 ((('very', 'RB'), ('happy', 'JJ')), 19),
 ((('were', 'VBD'), ('very', 'RB')), 19),
 ((('you', 'PRP'), ('could', 'MD')), 19),
 ((('sure', 'JJ'), ('you', 'PRP')), 19),
 ((('said', 'VBD'), ('Mrs.', 'NNP')), 19),
 ((('Harriet', 'NNP'), ('and', 'CC')), 19),
 ((('all', 'PDT'), ('that', 'IN')), 19),
 ((('have', 'VB'), ('thought', 'VBN')), 19),
 ((('which', 'WDT'), ('must', 'MD')), 19),
 ((('Emma', 'NNP'), ('felt', 'VBD')), 19),
 ((('will', 'MD'), ('soon', 'RB')), 19),
 ((('her', 'PRP$'), ('being', 'VBG')), 19),
 ((('had', 'VBD'), ('seen', 'VBN')), 19),
 ((('her', 'PRP'), ('with', 'IN')), 19),
 ((('the', 'DT'), ('whole', 'NN')), 19),
 ((('Mr.', 'NNP'), ('Churchill', 'NNP')), 19),
 ((('home', 'NN'), ('and', 'CC')), 18),
 ((('and', 'CC'), ('who', 'WP')), 18),
 ((('but', 'CC'), ('Emma', 'NNP')), 18),
 ((('and', 'CC'), ('would', 'MD')), 18),
 ((('but', 'CC'), ('when', 'WRB')), 18),
 ((('have', 'VBP'), ('never', 'RB')), 18),
 ((('very', 'RB'), ('soon', 'RB')), 18),
 ((('when', 'WRB'), ('you', 'PRP')), 18),
 ((('very', 'RB'), ('kind', 'NN')), 18),
 ((('half', 'PDT'), ('hour', 'NN')), 18),
 ((('you', 'PRP'), ('all', 'DT')), 18),
 ((('regard', 'NN'), ('for', 'IN')), 18),
 ((('felt', 'VBD'), ('that', 'IN')), 18),
 ((('not', 'RB'), ('allow', 'VB')), 18),
 ((('would', 'MD'), ('never', 'RB')), 18),
 ((('that', 'IN'), ('would', 'MD')), 18),
 ((('and', 'CC'), ('have', 'VBP')), 18),
 ((('which', 'WDT'), ('the', 'DT')), 18),
 ((('though', 'IN'), ('she', 'PRP')), 18),
 ((('but', 'CC'), ('still', 'RB')), 18),
 ((('was', 'VBD'), ('little', 'JJ')), 18),
 ((('with', 'IN'), ('Mrs.', 'NNP')), 18),
 ((('him', 'PRP'), ('was', 'VBD')), 18),
 ((('but', 'CC'), ('Mr.', 'NNP')), 18),
 ((('need', 'VBP'), ('not', 'RB')), 18),
 ((('you', 'PRP'), ('see', 'VBP')), 18),
 ((('any', 'DT'), ('one', 'CD')), 18),
 ((('the', 'DT'), ('case', 'NN')), 18),
 ((('may', 'MD'), ('have', 'VB')), 18),
 ((('not', 'RB'), ('mean', 'VB')), 18),
 ((('there', 'EX'), ('would', 'MD')), 18),
 ((('her', 'PRP$'), ('with', 'IN')), 18),
 ((('you', 'PRP'), ('think', 'VB')), 18),
 ((('that', 'WDT'), ('could', 'MD')), 18),
 ((('that', 'IN'), ('Harriet', 'NNP')), 18),
 ((('who', 'WP'), ('have', 'VBP')), 18),
 ((('would', 'MD'), ('rather', 'RB')), 18),
 ((('over', 'IN'), ('the', 'DT')), 18),
 ((('Upon', 'IN'), ('word', 'NN')), 18),
 ((('without', 'IN'), ('the', 'DT')), 18),
 ((('that', 'IN'), ('Mrs.', 'NNP')), 18),
 ((('Box', 'NNP'), ('Hill', 'NNP')), 18),
 ((('and', 'CC'), ('their', 'PRP$')), 17),
 ((('from', 'IN'), ('his', 'PRP$')), 17),
 ((('They', 'PRP'), ('are', 'VBP')), 17),
 ((('very', 'RB'), ('true', 'JJ')), 17),
 ((('how', 'WRB'), ('much', 'JJ')), 17),
 ((('their', 'PRP$'), ('own', 'JJ')), 17),
 ((('Highbury', 'NNP'), ('and', 'CC')), 17),
 ((('the', 'DT'), ('worst', 'JJS')), 17),
 ((('but', 'CC'), ('his', 'PRP$')), 17),
 ((('time', 'NN'), ('for', 'IN')), 17),
 ((('which', 'WDT'), ('her', 'PRP$')), 17),
 ((('her', 'PRP$'), ('She', 'PRP')), 17),
 ((('make', 'VB'), ('the', 'DT')), 17),
 ((('far', 'RB'), ('from', 'IN')), 17),
 ((('have', 'VBP'), ('doubt', 'NN')), 17),
 ((('very', 'RB'), ('bad', 'JJ')), 17),
 ((('that', 'IN'), ('his', 'PRP$')), 17),
 ((('they', 'PRP'), ('will', 'MD')), 17),
 ((('herself', 'PRP'), ('and', 'CC')), 17),
 ((('that', 'IN'), ('have', 'VBP')), 17),
 ((('had', 'VBD'), ('she', 'PRP')), 17),
 ((('you', 'PRP'), ('said', 'VBD')), 17),
 ((('not', 'RB'), ('bear', 'VB')), 17),
 ((('the', 'DT'), ('better', 'JJR')), 17),
 ((('hope', 'VBP'), ('you', 'PRP')), 17),
 ((('she', 'PRP'), ('does', 'VBZ')), 17),
 ((('with', 'IN'), ('great', 'JJ')), 17),
 ((('morning', 'NN'), ('and', 'CC')), 17),
 ((('friend', 'NN'), ('and', 'CC')), 16),
 ((('every', 'DT'), ('day', 'NN')), 16),
 ((('her', 'PRP'), ('from', 'IN')), 16),
 ((('was', 'VBD'), ('going', 'VBG')), 16),
 ((('his', 'PRP$'), ('life', 'NN')), 16),
 ((('last', 'JJ'), ('night', 'NN')), 16),
 ((('and', 'CC'), ('this', 'DT')), 16),
 ((('did', 'VBD'), ('you', 'PRP')), 16),
 ((('you', 'PRP'), ('please', 'VBP')), 16),
 ((('not', 'RB'), ('being', 'VBG')), 16),
 ((('but', 'CC'), ('one', 'CD')), 16),
 ((('not', 'RB'), ('make', 'VB')), 16),
 ((('not', 'RB'), ('like', 'IN')), 16),
 ((('You', 'PRP'), ('have', 'VBP')), 16),
 ((('think', 'VBP'), ('you', 'PRP')), 16),
 ((('was', 'VBD'), ('rather', 'RB')), 16),
 ((('was', 'VBD'), ('soon', 'RB')), 16),
 ((('the', 'DT'), ('name', 'NN')), 16),
 ((('that', 'IN'), ('any', 'DT')), 16),
 ((('was', 'VBD'), ('such', 'JJ')), 16),
 ((('what', 'WP'), ('had', 'VBD')), 16),
 ((('beyond', 'IN'), ('the', 'DT')), 16),
 ((('they', 'PRP'), ('would', 'MD')), 16),
 ((('are', 'VBP'), ('all', 'DT')), 16),
 ((('much', 'RB'), ('more', 'RBR')), 16),
 ((('exactly', 'RB'), ('the', 'DT')), 16),
 ((('the', 'DT'), ('truth', 'NN')), 16),
 ((('not', 'RB'), ('wish', 'VB')), 16),
 ((('from', 'IN'), ('Mr.', 'NNP')), 16),
 ((('you', 'PRP'), ('think', 'VBP')), 16),
 ((('have', 'VB'), ('made', 'VBN')), 16),
 ((('have', 'VBP'), ('done', 'VBN')), 16),
 ((('not', 'RB'), ('she', 'PRP')), 16),
 ((('her', 'PRP$'), ('very', 'RB')), 16),
 ((('for', 'IN'), ('your', 'PRP$')), 16),
 ((('not', 'RB'), ('imagine', 'VB')), 16),
 ((('only', 'RB'), ('the', 'DT')), 16),
 ((('will', 'MD'), ('have', 'VB')), 16),
 ((('feel', 'VB'), ('that', 'IN')), 16),
 ((('better', 'JJR'), ('than', 'IN')), 16),
 ((('between', 'IN'), ('them', 'PRP')), 16),
 ((('there', 'EX'), ('were', 'VBD')), 16),
 ((('before', 'IN'), ('her', 'PRP$')), 16),
 ((('that', 'WDT'), ('was', 'VBD')), 16),
 ((('the', 'DT'), ('moment', 'NN')), 16),
 ((('was', 'VBD'), ('that', 'IN')), 16),
 ((('for', 'IN'), ('Mrs.', 'NNP')), 16),
 ((('how', 'WRB'), ('she', 'PRP')), 15),
 ((('must', 'MD'), ('the', 'DT')), 15),
 ((('house', 'NN'), ('and', 'CC')), 15),
 ((('she', 'PRP'), ('were', 'VBD')), 15),
 ((('but', 'CC'), ('her', 'PRP$')), 15),
 ((('but', 'CC'), ('can', 'MD')), 15),
 ((('Woodhouse', 'NNP'), ('and', 'CC')), 15),
 ((('would', 'MD'), ('very', 'RB')), 15),
 ((('she', 'PRP'), ('can', 'MD')), 15),
 ((('man', 'NN'), ('and', 'CC')), 15),
 ((('take', 'VB'), ('care', 'NN')), 15),
 ((('for', 'IN'), ('she', 'PRP')), 15),
 ((('take', 'VB'), ('the', 'DT')), 15),
 ((('the', 'DT'), ('little', 'JJ')), 15),
 ((('his', 'PRP$'), ('son', 'NN')), 15),
 ((('though', 'IN'), ('the', 'DT')), 15),
 ((('among', 'IN'), ('the', 'DT')), 15),
 ((('thing', 'NN'), ('but', 'CC')), 15),
 ((('young', 'JJ'), ('ladies', 'NNS')), 15),
 ((('see', 'VB'), ('her', 'PRP$')), 15),
 ((('they', 'PRP'), ('must', 'MD')), 15),
 ((('had', 'VBD'), ('given', 'VBN')), 15),
 ((('there', 'EX'), ('are', 'VBP')), 15),
 ((('see', 'VB'), ('you', 'PRP')), 15),
 ((('and', 'CC'), ('should', 'MD')), 15),
 ((('not', 'RB'), ('see', 'VB')), 15),
 ((('quite', 'RB'), ('the', 'DT')), 15),
 ((('Thank', 'NNP'), ('you', 'PRP')), 15),
 ((('Very', 'RB'), ('well', 'RB')), 15),
 ((('shall', 'MD'), ('have', 'VB')), 15),
 ((('But', 'CC'), ('she', 'PRP')), 15),
 ((('was', 'VBD'), ('only', 'RB')), 15),
 ((('but', 'CC'), ('that', 'IN')), 15),
 ((('than', 'IN'), ('had', 'VBD')), 15),
 ((('before', 'IN'), ('and', 'CC')), 15),
 ((('from', 'IN'), ('him', 'PRP')), 15),
 ((('replied', 'VBD'), ('Emma', 'NNP')), 15),
 ((('was', 'VBD'), ('still', 'RB')), 15),
 ((('what', 'WP'), ('would', 'MD')), 15),
 ((('about', 'IN'), ('him', 'PRP')), 15),
 ((('Knightley', 'NNP'), ('was', 'VBD')), 15),
 ((('any', 'DT'), ('such', 'JJ')), 15),
 ((('She', 'PRP'), ('did', 'VBD')), 15),
 ((('her', 'PRP'), ('that', 'IN')), 15),
 ((('more', 'JJR'), ('and', 'CC')), 15),
 ((('Have', 'VBP'), ('you', 'PRP')), 15),
 ((('the', 'DT'), ('matter', 'NN')), 15),
 ((('how', 'WRB'), ('could', 'MD')), 15),
 ((('Miss', 'NNP'), ('Hawkins', 'NNP')), 15),
 ((('was', 'VBD'), ('more', 'RBR')), 14),
 ((('just', 'RB'), ('what', 'WP')), 14),
 ((('however', 'RB'), ('was', 'VBD')), 14),
 ((('the', 'DT'), ('family', 'NN')), 14),
 ((('could', 'MD'), ('never', 'RB')), 14),
 ((('for', 'IN'), ('having', 'VBG')), 14),
 ((('who', 'WP'), ('could', 'MD')), 14),
 ((('and', 'CC'), ('sure', 'JJ')), 14),
 ((('sure', 'JJ'), ('she', 'PRP')), 14),
 ((('this', 'DT'), ('time', 'NN')), 14),
 ((('that', 'IN'), ('must', 'MD')), 14),
 ((('and', 'CC'), ('what', 'WP')), 14),
 ((('been', 'VBN'), ('used', 'VBN')), 14),
 ((('and', 'CC'), ('shall', 'MD')), 14),
 ((('have', 'VB'), ('her', 'PRP$')), 14),
 ((('without', 'IN'), ('any', 'DT')), 14),
 ((('the', 'DT'), ('more', 'RBR')), 14),
 ((('brother', 'NN'), ('and', 'CC')), 14),
 ((('and', 'CC'), ('did', 'VBD')), 14),
 ((('ought', 'MD'), ('have', 'VB')), 14),
 ((('the', 'DT'), ('child', 'NN')), 14),
 ((('Churchill', 'NNP'), ('was', 'VBD')), 14),
 ((('was', 'VBD'), ('just', 'RB')), 14),
 ((('the', 'DT'), ('end', 'NN')), 14),
 ((('which', 'WDT'), ('could', 'MD')), 14),
 ((('had', 'VBD'), ('taken', 'VBN')), 14),
 ((('had', 'VBD'), ('had', 'VBN')), 14),
 ((('while', 'IN'), ('she', 'PRP')), 14),
 ((('the', 'DT'), ('Abbey', 'NNP')), 14),
 ((('for', 'IN'), ('though', 'IN')), 14),
 ((('that', 'IN'), ('might', 'MD')), 14),
 ((('there', 'EX'), ('will', 'MD')), 14),
 ((('been', 'VBN'), ('able', 'JJ')), 14),
 ((('time', 'NN'), ('and', 'CC')), 14),
 ((('himself', 'PRP'), ('and', 'CC')), 14),
 ((('was', 'VBD'), ('really', 'RB')), 14),
 ((('have', 'VBP'), ('seen', 'VBN')), 14),
 ((('She', 'PRP'), ('will', 'MD')), 14),
 ((('not', 'RB'), ('give', 'VB')), 14),
 ((('away', 'RB'), ('and', 'CC')), 14),
 ((('Mrs.', 'NNP'), ('John', 'NNP')), 14),
 ((('the', 'DT'), ('point', 'NN')), 14),
 ((('who', 'WP'), ('was', 'VBD')), 14),
 ((('give', 'VB'), ('the', 'DT')), 14),
 ((('not', 'RB'), ('believe', 'VB')), 14),
 ((('make', 'VB'), ('her', 'PRP$')), 14),
 ((('seeing', 'VBG'), ('him', 'PRP')), 14),
 ((('the', 'DT'), ('table', 'NN')), 14),
 ((('that', 'IN'), ('can', 'MD')), 14),
 ((('thing', 'NN'), ('that', 'WDT')), 14),
 ((('the', 'DT'), ('morning', 'NN')), 14),
 ((('rest', 'NN'), ('the', 'DT')), 14),
 ((('they', 'PRP'), ('could', 'MD')), 14),
 ((('They', 'PRP'), ('had', 'VBD')), 14),
 ((('never', 'RB'), ('had', 'VBD')), 14),
 ((('told', 'VBD'), ('you', 'PRP')), 14),
 ((('you', 'PRP'), ('did', 'VBD')), 14),
 ((('said', 'VBD'), ('and', 'CC')), 14),
 ((('Churchill', 'NNP'), ('had', 'VBD')), 14),
 ((('without', 'IN'), ('being', 'VBG')), 14),
 ((('Mrs.', 'NNP'), ('Dixon', 'NNP')), 14),
 ((('dear', 'JJ'), ('Jane', 'NNP')), 14),
 ((('the', 'DT'), ('ball', 'NN')), 14),
 ((('the', 'DT'), ('power', 'NN')), 13),
 ((('not', 'RB'), ('any', 'DT')), 13),
 ((('had', 'VBD'), ('always', 'RB')), 13),
 ((('the', 'DT'), ('match', 'NN')), 13),
 ((('was', 'VBD'), ('much', 'JJ')), 13),
 ((('but', 'CC'), ('with', 'IN')), 13),
 ((('for', 'IN'), ('them', 'PRP')), 13),
 ((('have', 'VB'), ('had', 'VBN')), 13),
 ((('are', 'VBP'), ('the', 'DT')), 13),
 ((('very', 'RB'), ('sure', 'JJ')), 13),
 ((('poor', 'JJ'), ('Miss', 'NNP')), 13),
 ((('the', 'DT'), ('question', 'NN')), 13),
 ((('was', 'VBD'), ('one', 'CD')), 13),
 ((('made', 'VBD'), ('the', 'DT')), 13),
 ((('said', 'VBD'), ('that', 'IN')), 13),
 ((('that', 'IN'), ('Miss', 'NNP')), 13),
 ((('with', 'IN'), ('much', 'JJ')), 13),
 ((('for', 'IN'), ('Miss', 'NNP')), 13),
 ((('who', 'WP'), ('were', 'VBD')), 13),
 ((('had', 'VBD'), ('made', 'VBN')), 13),
 ((('the', 'DT'), ('visit', 'NN')), 13),
 ((('the', 'DT'), ('hope', 'NN')), 13),
 ((('She', 'PRP'), ('felt', 'VBD')), 13),
 ((('had', 'VBD'), ('passed', 'VBN')), 13),
 ((('mother', 'NN'), ('and', 'CC')), 13),
 ((('was', 'VBD'), ('great', 'JJ')), 13),
 ((('that', 'DT'), ('was', 'VBD')), 13),
 ((('and', 'CC'), ('while', 'IN')), 13),
 ((('very', 'RB'), ('often', 'RB')), 13),
 ((('part', 'NN'), ('the', 'DT')), 13),
 ((('Miss', 'NNP'), ('Nash', 'NNP')), 13),
 ((('think', 'VB'), ('him', 'PRP')), 13),
 ((('who', 'WP'), ('would', 'MD')), 13),
 ((('next', 'JJ'), ('day', 'NN')), 13),
 ((('was', 'VBD'), ('her', 'PRP$')), 13),
 ((('the', 'DT'), ('usual', 'JJ')), 13),
 ((('there', 'EX'), ('being', 'VBG')), 13),
 ((('soon', 'RB'), ('afterwards', 'NNS')), 13),
 ((('the', 'DT'), ('weather', 'NN')), 13),
 ((('made', 'VBN'), ('her', 'PRP$')), 13),
 ((('very', 'RB'), ('few', 'JJ')), 13),
 ((('said', 'VBD'), ('the', 'DT')), 13),
 ((('had', 'VBD'), ('great', 'JJ')), 13),
 ((('your', 'PRP$'), ('friend', 'NN')), 13),
 ((('and', 'CC'), ('nothing', 'NN')), 13),
 ((('Emma', 'NNP'), ('said', 'VBD')), 13),
 ((('she', 'PRP'), ('ought', 'MD')), 13),
 ((('shall', 'MD'), ('never', 'RB')), 13),
 ((('you', 'PRP'), ('that', 'IN')), 13),
 ((('not', 'RB'), ('often', 'RB')), 13),
 ((('sure', 'JJ'), ('that', 'IN')), 13),
 ((('the', 'DT'), ('lady', 'NN')), 13),
 ((('she', 'PRP'), ('felt', 'VBD')), 13),
 ((('not', 'RB'), ('even', 'RB')), 13),
 ((('for', 'IN'), ('few', 'JJ')), 13),
 ((('came', 'VBD'), ('and', 'CC')), 13),
 ((('was', 'VBD'), ('but', 'CC')), 13),
 ((('not', 'RB'), ('let', 'VB')), 13),
 ((('him', 'PRP'), ('with', 'IN')), 13),
 ((('there', 'EX'), ('had', 'VBD')), 13),
 ((('and', 'CC'), ('little', 'JJ')), 13),
 ((('even', 'RB'), ('the', 'DT')), 13),
 ((('great', 'JJ'), ('pleasure', 'NN')), 13),
 ((('that', 'IN'), ('Jane', 'NNP')), 13),
 ((('You', 'PRP'), ('may', 'MD')), 13),
 ((('Emma', 'NNP'), ('would', 'MD')), 13),
 ((('the', 'DT'), ('instrument', 'NN')), 13),
 ((('William', 'NNP'), ('Larkins', 'NNP')), 13),
 ((('way', 'NN'), ('and', 'CC')), 12),
 ((('all', 'PDT'), ('his', 'PRP$')), 12),
 ((('any', 'DT'), ('time', 'NN')), 12),
 ((('give', 'VB'), ('her', 'PRP$')), 12),
 ((('when', 'WRB'), ('was', 'VBD')), 12),
 ((('the', 'DT'), ('poor', 'JJ')), 12),
 ((('all', 'PDT'), ('that', 'DT')), 12),
 ((('very', 'RB'), ('glad', 'JJ')), 12),
 ((('about', 'IN'), ('her', 'PRP$')), 12),
 ((('for', 'IN'), ('some', 'DT')), 12),
 ((('Knightley', 'NNP'), ('had', 'VBD')), 12),
 ((('Woodhouse', 'NNP'), ('you', 'PRP')), 12),
 ((('have', 'VB'), ('him', 'PRP')), 12),
 ((('her', 'PRP$'), ('not', 'RB')), 12),
 ((('and', 'CC'), ('were', 'VBD')), 12),
 ((('for', 'IN'), ('Emma', 'NNP')), 12),
 ((('but', 'CC'), ('must', 'MD')), 12),
 ((('need', 'MD'), ('not', 'RB')), 12),
 ((('between', 'IN'), ('the', 'DT')), 12),
 ((('come', 'VB'), ('and', 'CC')), 12),
 ((('made', 'VBD'), ('him', 'PRP')), 12),
 ((('she', 'PRP'), ('wanted', 'VBD')), 12),
 ((('the', 'DT'), ('Churchills', 'NNP')), 12),
 ((('had', 'VBD'), ('gone', 'VBN')), 12),
 ((('more', 'RBR'), ('than', 'IN')), 12),
 ((('never', 'RB'), ('saw', 'VBD')), 12),
 ((('good', 'JJ'), ('sense', 'NN')), 12),
 ((('had', 'VBD'), ('already', 'RB')), 12),
 ((('which', 'WDT'), ('Emma', 'NNP')), 12),
 ((('always', 'RB'), ('the', 'DT')), 12),
 ((('and', 'CC'), ('having', 'VBG')), 12),
 ((('Emma', 'NNP'), ('found', 'VBD')), 12),
 ((('end', 'NN'), ('the', 'DT')), 12),
 ((('had', 'VBD'), ('just', 'RB')), 12),
 ((('being', 'VBG'), ('very', 'RB')), 12),
 ((('had', 'VBD'), ('very', 'RB')), 12),
 ((('and', 'CC'), ('looked', 'VBD')), 12),
 ((('But', 'CC'), ('the', 'DT')), 12),
 ((('she', 'PRP'), ('came', 'VBD')), 12),
 ((('day', 'NN'), ('and', 'CC')), 12),
 ((('and', 'CC'), ('that', 'DT')), 12),
 ((('when', 'WRB'), ('Mr.', 'NNP')), 12),
 ((('are', 'VBP'), ('quite', 'RB')), 12),
 ((('quite', 'RB'), ('well', 'RB')), 12),
 ((('had', 'VBD'), ('better', 'RBR')), 12),
 ((('that', 'IN'), ('not', 'RB')), 12),
 ((('his', 'PRP$'), ('side', 'NN')), 12),
 ((('know', 'VBP'), ('you', 'PRP')), 12),
 ((('them', 'PRP'), ('both', 'DT')), 12),
 ((('should', 'MD'), ('like', 'VB')), 12),
 ((('cried', 'VBD'), ('Mr.', 'NNP')), 12),
 ((('How', 'WRB'), ('could', 'MD')), 12),
 ((('soon', 'RB'), ('she', 'PRP')), 12),
 ((('her', 'PRP$'), ('hand', 'NN')), 12),
 ((('was', 'VBD'), ('gone', 'VBN')), 12),
 ((('dear', 'JJ'), ('sir', 'NN')), 12),
 ((('but', 'CC'), ('very', 'RB')), 12),
 ((('not', 'RB'), ('speak', 'VB')), 12),
 ((('never', 'RB'), ('have', 'VB')), 12),
 ((('with', 'IN'), ('Harriet', 'NNP')), 12),
 ((('which', 'WDT'), ('made', 'VBD')), 12),
 ((('and', 'CC'), ('found', 'VBD')), 12),
 ((('Elton', 'NNP'), ('had', 'VBD')), 12),
 ((('would', 'MD'), ('the', 'DT')), 12),
 ((('out', 'RP'), ('the', 'DT')), 12),
 ((('and', 'CC'), ('sister', 'NN')), 12),
 ((('the', 'DT'), ('sight', 'NN')), 12),
 ((('nothing', 'NN'), ('else', 'RB')), 12),
 ((('one', 'CD'), ('can', 'MD')), 12),
 ((('would', 'MD'), ('soon', 'RB')), 12),
 ((('she', 'PRP'), ('and', 'CC')), 12),
 ((('she', 'PRP'), ('hoped', 'VBD')), 12),
 ((('thing', 'NN'), ('was', 'VBD')), 12),
 ((('been', 'VBN'), ('long', 'RB')), 12),
 ((('such', 'PDT'), ('thing', 'NN')), 12),
 ((('had', 'VBD'), ('better', 'JJR')), 12),
 ((('that', 'IN'), ('did', 'VBD')), 12),
 ((('you', 'PRP'), ('might', 'MD')), 12),
 ((('nothing', 'NN'), ('the', 'DT')), 12),
 ((('this', 'DT'), ('very', 'RB')), 12),
 ((('the', 'DT'), ('Coles', 'NNP')), 12),
 ((('Miss', 'NNP'), ('Campbell', 'NNP')), 12),
 ((('glad', 'JJ'), ('see', 'VB')), 12),
 ((('said', 'VBD'), ('was', 'VBD')), 12),
 ((('have', 'VBP'), ('the', 'DT')), 12),
 ((('the', 'DT'), ('pianoforte', 'NN')), 12),
 ((('was', 'VBD'), ('extremely', 'RB')), 12),
 ((('with', 'IN'), ('very', 'RB')), 11),
 ((('with', 'IN'), ('what', 'WP')), 11),
 ((('their', 'PRP$'), ('being', 'VBG')), 11),
 ((('and', 'CC'), ('give', 'VB')), 11),
 ((('not', 'RB'), ('one', 'CD')), 11),
 ((('with', 'IN'), ('Miss', 'NNP')), 11),
 ((('and', 'CC'), ('from', 'IN')), 11),
 ((('thing', 'NN'), ('for', 'IN')), 11),
 ((('the', 'DT'), ('advantage', 'NN')), 11),
 ((('You', 'PRP'), ('know', 'VBP')), 11),
 ((('have', 'VBP'), ('great', 'JJ')), 11),
 ((('and', 'CC'), ('never', 'RB')), 11),
 ((('wish', 'VBP'), ('you', 'PRP')), 11),
 ((('Emma', 'NNP'), ('but', 'CC')), 11),
 ((('only', 'RB'), ('one', 'CD')), 11),
 ((('she', 'PRP'), ('really', 'RB')), 11),
 ((('all', 'DT'), ('that', 'WDT')), 11),
 ((('and', 'CC'), ('has', 'VBZ')), 11),
 ((('with', 'IN'), ('some', 'DT')), 11),
 ((('had', 'VBD'), ('received', 'VBN')), 11),
 ((('never', 'RB'), ('seen', 'VBN')), 11),
 ((('the', 'DT'), ('great', 'JJ')), 11),
 ((('soon', 'RB'), ('after', 'IN')), 11),
 ((('but', 'CC'), ('they', 'PRP')), 11),
 ((('Mrs.', 'NNP'), ('Perry', 'NNP')), 11),
 ((('few', 'JJ'), ('days', 'NNS')), 11),
 ((('Woodhouse', 'NNP'), ('would', 'MD')), 11),
 ((('made', 'VBD'), ('her', 'PRP$')), 11),
 ((('one', 'CD'), ('morning', 'NN')), 11),
 ((('the', 'DT'), ('common', 'JJ')), 11),
 ((('for', 'IN'), ('their', 'PRP$')), 11),
 ((('you', 'PRP'), ('Miss', 'NNP')), 11),
 ((('Hartfield', 'NNP'), ('was', 'VBD')), 11),
 ((('Harriet', 'NNP'), ('would', 'MD')), 11),
 ((('was', 'VBD'), ('ready', 'JJ')), 11),
 ((('but', 'CC'), ('this', 'DT')), 11),
 ((('and', 'CC'), ('one', 'CD')), 11),
 ((('little', 'JJ'), ('friend', 'NN')), 11),
 ((('all', 'PDT'), ('this', 'DT')), 11),
 ((('because', 'IN'), ('she', 'PRP')), 11),
 ((('thing', 'NN'), ('else', 'RB')), 11),
 ((('was', 'VBD'), ('with', 'IN')), 11),
 ((('was', 'VBD'), ('sure', 'JJ')), 11),
 ((('can', 'MD'), ('have', 'VB')), 11),
 ((('who', 'WP'), ('are', 'VBP')), 11),
 ((('but', 'CC'), ('would', 'MD')), 11),
 ((('will', 'MD'), ('very', 'RB')), 11),
 ((('you', 'PRP'), ('the', 'DT')), 11),
 ((('during', 'IN'), ('the', 'DT')), 11),
 ((('ever', 'RB'), ('since', 'IN')), 11),
 ((('never', 'RB'), ('could', 'MD')), 11),
 ((('you', 'PRP'), ('any', 'DT')), 11),
 ((('much', 'RB'), ('obliged', 'VBN')), 11),
 ((('obliged', 'VBN'), ('you', 'PRP')), 11),
 ((('not', 'RB'), ('seem', 'VB')), 11),
 ((('think', 'VB'), ('the', 'DT')), 11),
 ((('not', 'RB'), ('feel', 'VB')), 11),
 ((('you', 'PRP'), ('ever', 'RB')), 11),
 ((('whether', 'IN'), ('she', 'PRP')), 11),
 ((('and', 'CC'), ('see', 'VB')), 11),
 ((('her', 'PRP$'), ('face', 'NN')), 11),
 ((('dear', 'JJ'), ('Miss', 'NNP')), 11),
 ((('but', 'CC'), ('now', 'RB')), 11),
 ((('felt', 'VBD'), ('the', 'DT')), 11),
 ((('say', 'VB'), ('and', 'CC')), 11),
 ((('that', 'DT'), ('moment', 'NN')), 11),
 ((('her', 'PRP$'), ('eyes', 'NNS')), 11),
 ((('Woodhouse', 'NNP'), ('who', 'WP')), 11),
 ((('the', 'DT'), ('man', 'NN')), 11),
 ((('the', 'DT'), ('purpose', 'NN')), 11),
 ((('very', 'RB'), ('likely', 'JJ')), 11),
 ((('her', 'PRP$'), ('side', 'NN')), 11),
 ((('You', 'PRP'), ('and', 'CC')), 11),
 ((('they', 'PRP'), ('have', 'VBP')), 11),
 ((('which', 'WDT'), ('they', 'PRP')), 11),
 ((('could', 'MD'), ('hardly', 'RB')), 11),
 ((('are', 'VBP'), ('too', 'RB')), 11),
 ((('his', 'PRP$'), ('brother', 'NN')), 11),
 ((('dare', 'VBP'), ('say', 'VBP')), 11),
 ((('for', 'IN'), ('every', 'DT')), 11),
 ((('feelings', 'NNS'), ('and', 'CC')), 11),
 ((('could', 'MD'), ('wish', 'VB')), 11),
 ((('his', 'PRP$'), ('coming', 'VBG')), 11),
 ((('the', 'DT'), ('happiness', 'NN')), 11),
 ((('upon', 'IN'), ('word', 'NN')), 11),
 ((('Churchill', 'NNP'), ('and', 'CC')), 11),
 ((('will', 'MD'), ('the', 'DT')), 11),
 ((('answer', 'VB'), ('for', 'IN')), 11),
 ((('poor', 'JJ'), ('Harriet', 'NNP')), 11),
 ((('that', 'DT'), ('sort', 'NN')), 11),
 ((('much', 'RB'), ('more', 'JJR')), 11),
 ((('man', 'NN'), ('who', 'WP')), 11),
 ((('five', 'CD'), ('minutes', 'NNS')), 11),
 ((('Fairfax', 'NNP'), ('was', 'VBD')), 11),
 ((('the', 'DT'), ('person', 'NN')), 11),
 ((('much', 'JJ'), ('her', 'PRP$')), 10),
 ((('over', 'IN'), ('and', 'CC')), 10),
 ((('years', 'NNS'), ('old', 'JJ')), 10),
 ((('from', 'IN'), ('them', 'PRP')), 10),
 ((('the', 'DT'), ('difference', 'NN')), 10),
 ((('heart', 'NN'), ('and', 'CC')), 10),
 ((('Isabella', 'NNP'), ('and', 'CC')), 10),
 ((('Highbury', 'NNP'), ('the', 'DT')), 10),
 ((('her', 'PRP$'), ('but', 'CC')), 10),
 ((('impossible', 'JJ'), ('for', 'IN')), 10),
 ((('And', 'CC'), ('you', 'PRP')), 10),
 ((('how', 'WRB'), ('very', 'RB')), 10),
 ((('and', 'CC'), ('always', 'RB')), 10),
 ((('than', 'IN'), ('usual', 'JJ')), 10),
 ((('Brunswick', 'NNP'), ('Square', 'NNP')), 10),
 ((('you', 'PRP'), ('Mr.', 'NNP')), 10),
 ((('have', 'VB'), ('had', 'VBD')), 10),
 ((('said', 'VBD'), ('her', 'PRP$')), 10),
 ((('But', 'CC'), ('Mr.', 'NNP')), 10),
 ((('very', 'RB'), ('sorry', 'JJ')), 10),
 ((('his', 'PRP$'), ('head', 'NN')), 10),
 ((('Weston', 'NNP'), ('who', 'WP')), 10),
 ((('like', 'IN'), ('Mr.', 'NNP')), 10),
 ((('after', 'IN'), ('all', 'DT')), 10),
 ((('Depend', 'NNP'), ('upon', 'IN')), 10),
 ((('two', 'CD'), ('three', 'CD')), 10),
 ((('was', 'VBD'), ('nothing', 'NN')), 10),
 ((('and', 'CC'), ('some', 'DT')), 10),
 ((('give', 'VB'), ('him', 'PRP')), 10),
 ((('been', 'VBN'), ('there', 'RB')), 10),
 ((('sense', 'NN'), ('and', 'CC')), 10),
 ((('And', 'CC'), ('then', 'RB')), 10),
 ((('body', 'NN'), ('and', 'CC')), 10),
 ((('against', 'IN'), ('the', 'DT')), 10),
 ((('with', 'IN'), ('many', 'JJ')), 10),
 ((('not', 'RB'), ('much', 'JJ')), 10),
 ((('pleased', 'VBN'), ('with', 'IN')), 10),
 ((('from', 'IN'), ('Mrs.', 'NNP')), 10),
 ((('all', 'DT'), ('that', 'DT')), 10),
 ((('the', 'DT'), ('country', 'NN')), 10),
 ((('and', 'CC'), ('quite', 'RB')), 10),
 ((('her', 'PRP$'), ('she', 'PRP')), 10),
 ((('while', 'IN'), ('the', 'DT')), 10),
 ((('gratitude', 'NN'), ('and', 'CC')), 10),
 ((('her', 'PRP$'), ('thoughts', 'NNS')), 10),
 ((('indeed', 'RB'), ('and', 'CC')), 10),
 ((('one', 'CD'), ('day', 'NN')), 10),
 ((('had', 'VBD'), ('come', 'VBN')), 10),
 ((('his', 'PRP$'), ('way', 'NN')), 10),
 ((('made', 'VBD'), ('her', 'PRP')), 10),
 ((('said', 'VBD'), ('Harriet', 'NNP')), 10),
 ((('that', 'IN'), ('their', 'PRP$')), 10),
 ((('doubt', 'NN'), ('that', 'IN')), 10),
 ((('Emma', 'NNP'), ('thought', 'VBD')), 10),
 ((('his', 'PRP$'), ('manners', 'NNS')), 10),
 ((('and', 'CC'), ('without', 'IN')), 10),
 ((('there', 'EX'), ('could', 'MD')), 10),
 ((('the', 'DT'), ('comfort', 'NN')), 10),
 ((('great', 'JJ'), ('many', 'JJ')), 10),
 ((('will', 'MD'), ('never', 'RB')), 10),
 ((('you', 'PRP'), ('but', 'CC')), 10),
 ((('could', 'MD'), ('you', 'PRP')), 10),
 ((('perhaps', 'RB'), ('the', 'DT')), 10),
 ((('could', 'MD'), ('only', 'RB')), 10),
 ((('all', 'DT'), ('was', 'VBD')), 10),
 ((('most', 'RBS'), ('happy', 'JJ')), 10),
 ((('given', 'VBN'), ('her', 'PRP$')), 10),
 ((('letter', 'NN'), ('was', 'VBD')), 10),
 ((('her', 'PRP'), ('very', 'RB')), 10),
 ((('give', 'VB'), ('you', 'PRP')), 10),
 ((('not', 'RB'), ('without', 'IN')), 10),
 ((('and', 'CC'), ('can', 'MD')), 10),
 ((('and', 'CC'), ('are', 'VBP')), 10),
 ((('not', 'RB'), ('consider', 'VB')), 10),
 ((('quarter', 'NN'), ('hour', 'NN')), 10),
 ((('sat', 'VBD'), ('down', 'RB')), 10),
 ((('the', 'DT'), ('good', 'JJ')), 10),
 ((('She', 'PRP'), ('has', 'VBZ')), 10),
 ((('with', 'IN'), ('smile', 'NN')), 10),
 ((('what', 'WP'), ('the', 'DT')), 10),
 ((('saw', 'VBD'), ('her', 'PRP')), 10),
 ((('and', 'CC'), ('manner', 'NN')), 10),
 ((('the', 'DT'), ('highest', 'JJS')), 10),
 ((('have', 'VBP'), ('always', 'RB')), 10),
 ((('not', 'RB'), ('find', 'VB')), 10),
 ((('she', 'PRP'), ('very', 'RB')), 10),
 ((('satisfied', 'VBN'), ('with', 'IN')), 10),
 ((('Elton', 'NNP'), ('the', 'DT')), 10),
 ((('came', 'VBD'), ('back', 'RB')), 10),
 ((('there', 'EX'), ('must', 'MD')), 10),
 ((('was', 'VBD'), ('and', 'CC')), 10),
 ((('And', 'CC'), ('the', 'DT')), 10),
 ((('that', 'IN'), ('has', 'VBZ')), 10),
 ((('And', 'CC'), ('how', 'WRB')), 10),
 ((('you', 'PRP'), ('You', 'PRP')), 10),
 ((('who', 'WP'), ('can', 'MD')), 10),
 ((('may', 'MD'), ('well', 'RB')), 10),
 ((('not', 'RB'), ('wonder', 'VB')), 10),
 ((('the', 'DT'), ('honour', 'NN')), 10),
 ((('that', 'IN'), ('when', 'WRB')), 10),
 ((('under', 'IN'), ('the', 'DT')), 10),
 ((('her', 'PRP$'), ('attention', 'NN')), 10),
 ((('Colonel', 'NNP'), ('and', 'CC')), 10),
 ((('away', 'RB'), ('from', 'IN')), 10),
 ((('his', 'PRP$'), ('opinion', 'NN')), 10),
 ((('that', 'IN'), ('may', 'MD')), 10),
 ((('was', 'VBD'), ('done', 'VBN')), 10),
 ((('can', 'MD'), ('see', 'VB')), 10),
 ((('time', 'NN'), ('the', 'DT')), 10),
 ((('which', 'WDT'), ('did', 'VBD')), 10),
 ((('him', 'PRP'), ('her', 'PRP$')), 10),
 ((('distance', 'NN'), ('from', 'IN')), 10),
 ((('was', 'VBD'), ('every', 'DT')), 10),
 ((('than', 'IN'), ('can', 'MD')), 10),
 ((('Jane', 'NNP'), ('was', 'VBD')), 10),
 ((('Jane', 'NNP'), ('and', 'CC')), 10),
 ((('Bates', 'NNP'), ('was', 'VBD')), 10),
 ((('together', 'RB'), ('and', 'CC')), 10),
 ((('young', 'JJ'), ('people', 'NNS')), 10),
 ((('And', 'CC'), ('now', 'RB')), 10),
 ((('had', 'VBD'), ('brought', 'VBN')), 10),
 ((('said', 'VBD'), ('Frank', 'NNP')), 10),
 ((('and', 'CC'), ('soon', 'RB')), 10),
 ((('that', 'WDT'), ('would', 'MD')), 10),
 ((('How', 'WRB'), ('you', 'PRP')), 10),
 ((('not', 'RB'), ('yet', 'RB')), 10),
 ((('the', 'DT'), ('Eltons', 'NNS')), 10),
 ((('Mr.', 'NNP'), ('Suckling', 'NNP')), 10),
 ((('and', 'CC'), ('happy', 'JJ')), 9),
 ((('her', 'PRP'), ('She', 'PRP')), 9),
 ((('they', 'PRP'), ('did', 'VBD')), 9),
 ((('Her', 'PRP$'), ('father', 'NN')), 9),
 ((('after', 'IN'), ('dinner', 'NN')), 9),
 ((('was', 'VBD'), ('she', 'PRP')), 9),
 ((('the', 'DT'), ('change', 'NN')), 9),
 ((('natural', 'JJ'), ('and', 'CC')), 9),
 ((('Woodhouse', 'NNP'), ('had', 'VBD')), 9),
 ...]

한글 적용!

In [218]:
from konlpy.corpus import kolaw

# Read the first document of the KoNLPy `kolaw` corpus; the `with` block
# closes the underlying file handle as soon as the text has been read.
with kolaw.open(kolaw.fileids()[0]) as corpus_file:
    corpus = corpus_file.read()

# Run the (slow) Kkma morphological analysis exactly once.
# Each element of pos() is a (word, tag) pair; only the word (pair[0]) is kept
# because the collocation finder below works on plain word sequences.
tokens = [pair[0] for pair in Kkma().pos(corpus)]

# Collect adjacent word pairs (bigrams) from the token sequence.
bigram = BigramCollocationFinder.from_words(tokens)
measures = BigramAssocMeasures()

# Ten best pairs by PMI. Stored in a variable because a notebook cell only
# displays the value of its last expression.
pmi_top10 = bigram.nbest(measures.pmi, 10)

# Ten best pairs by chi-square; every token was treated as a separate word.
bigram.nbest(measures.chi_sq, 10)
Out[218]:
[('가부', '동수'),
 ('강제', '노역'),
 ('경자', '유전'),
 ('공무', '담임'),
 ('공중', '도덕'),
 ('과반', '수가'),
 ('교전', '상태'),
 ('기본적', '인권'),
 ('내부', '규율'),
 ('노인과', '청소년')]

뉴스기사 적용

수집한 기사로 해 보니 결과가 잘 안 나옴.

데이터의 균형이 잘 안 잡혀 있는 것!

기사들이 서로 다 다른 이야기를 하고 있음.

데이터 부족도 문제! ( 카테고리를 특정 짓지 못함 )

In [226]:
# Gather every "세계*.txt" article under ./News into one corpus string.
news_dir = "./News"
documents = []
# sorted(): os.listdir() returns entries in arbitrary order, so sorting keeps
# the corpus (and any bigram spanning two files) reproducible between runs.
for file in sorted(os.listdir(news_dir)):
    if file.startswith("세계") and file.endswith(".txt"):
        with open(os.path.join(news_dir, file), encoding='utf-8') as f:
            documents.append(f.read())

# Join with a newline so the last word of one article is not glued to the
# first word of the next one.
corpus = "\n".join(documents)

# nouns() extracts nouns only (no POS tags are returned here).
# NOTE(review): the displayed top pairs look like a sorted, de-duplicated noun
# list rather than running text — confirm whether Kkma().nouns() preserves
# corpus order and repetitions; if it does not, filter Kkma().pos() output by
# noun tags instead so the bigrams reflect real adjacency.
tokens = list(Kkma().nouns(corpus))

# Collect adjacent noun pairs (bigrams) from the token sequence.
bigram = BigramCollocationFinder.from_words(tokens)
measures = BigramAssocMeasures()

# Ten best pairs by PMI. Stored in a variable because a notebook cell only
# displays the value of its last expression.
pmi_top10 = bigram.nbest(measures.pmi, 10)

# Ten best pairs by chi-square; every token was treated as a separate word.
bigram.nbest(measures.chi_sq, 10)
Out[226]:
[('1', '1시간'),
 ('10', '10월'),
 ('100', '보'),
 ('100건', '예방'),
 ('10배', '100건'),
 ('10월', '라이'),
 ('10일', '여객기'),
 ('11', '11일'),
 ('11월', '후난성'),
 ('11일', '인도네시아')]


728x90
반응형