728x90
반응형
In [1]:
# Example English sentence used for the tokenizing / POS-tagging / chunking cells below.
sentence = "The little bear saw the fat trout in the book"
In [100]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
In [6]:
# Split the sentence into word tokens, then attach a POS tag to each token.
tokens = word_tokenize(sentence)
tagged = pos_tag(tokens)
In [7]:
tagged # a list whose items are tuples
Out[7]:
In [8]:
from nltk.help import upenn_tagset
In [11]:
# The argument is a regular expression over tag names: "N.*" selects every
# tag that starts with N. (A comma instead of the dot would only match by
# accident, because ",*" also accepts zero commas.)
upenn_tagset("N.*")
In [13]:
from nltk.chunk.regexp import RegexpParser
# Single-stage chunk grammar: an NP is an optional DT followed by one NN* tag.
grammar = RegexpParser("NP: {<DT>?<NN.*>}")
# NOTE(review): `pharseTree` looks like a typo for `parseTree`; renaming it
# would also require touching the later cells that read this variable.
pharseTree = grammar.parse(tagged)
# DT : determiner
# ? : the preceding tag may or may not be present
# DT NN / NN / NNP / DT NNP are all accepted
In [15]:
# Pretty-print the chunked tree as text.
pharseTree.pprint()
In [18]:
# Display the tree as the cell output
pharseTree
# Calling pharseTree.draw() instead shows it in a new window
Out[18]:
In [26]:
sentence = "The little yellow dog barked at the cat"
tagged = pos_tag(word_tokenize(sentence))
# NP = optional determiner, any number of adjective tags, then one noun tag.
# Quantifiers available inside a tag pattern:
#   ?  zero or one occurrence
#   *  zero or more occurrences
#   +  one or more occurrences
grammar = RegexpParser("""
NP : {<DT>?<JJ.*>*<NN.*>}
""")
# Parse once and reuse the result, both for printing here and for the
# display in the next cell.
Tree = grammar.parse(tagged)
Tree.pprint()
In [27]:
# Display the chunked tree as the cell output.
Tree
Out[27]:
In [53]:
# Kkma: Korean tagger provided by KoNLPy
from konlpy.tag import Kkma
sentence = "내 친구가 잠을 많이 잔다."
kkma_tagger = Kkma()
tagged = kkma_tagger.pos(sentence)
In [54]:
# Display the Kkma tagging result for the Korean sentence.
tagged
Out[54]:
In [55]:
# NP: one or more N* tags, optionally followed by one J* tag.
# VP: an optional M* tag, then a VV* tag and an E* tag.
# NOTE(review): tag names presumably follow the Kkma tagset (N* nouns,
# J* particles, VV verbs, E* endings) — confirm against the tagger output.
rules = RegexpParser("""
NP : {<N.*>+<J.*>?}
VP : {<M.*>?<VV.*><E.*>}
""")
# The adverb (M) may or may not be present
In [57]:
# The text is Korean, so draw() is used to get the Hangul labels displayed.
# draw() returns None, so the parsed tree must be stored BEFORE drawing;
# otherwise parseTree would be None and the cells below that call
# parseTree.subtrees() would fail.
parseTree = rules.parse(tagged)
parseTree.draw()
parseTree
In [39]:
# Check what kind of object the parse step produced.
type(parseTree)
Out[39]:
In [41]:
# Print every subtree of the parse whose label is NP.
for np_chunk in parseTree.subtrees(filter=lambda node: node.label() == "NP"):
    print(np_chunk)
In [382]:
# Structurally ambiguous example sentence: tokenize it, then POS-tag the tokens.
sentence = "I shot an elephant in my pajamas"
tokens = word_tokenize(sentence)
tagged = pos_tag(tokens)
In [383]:
# Display the (token, tag) pairs for the sentence above.
tagged
Out[383]:
In [384]:
# Stages must be defined bottom-up: each stage runs in the order written and
# later stages refer to the labels produced by earlier ones.
# NOTE(review): the `NP : {<P><NP>}` stage labels "preposition + NP" as NP,
# whereas the grammar for "the dog saw a man in the park" further down labels
# the same shape PP — confirm which label is intended here.
rules = RegexpParser("""
Det : {<DT>?<PRP.+>?}
N : {<NN.*>}
NP : {<Det><N>}
V : {<VBP>}
P : {<I.*>}
VP : {<V><NP>}
NP : {<P><NP>}
VP : {<VP><NP>}
""")
# Pretty-print the cascaded chunking result as text.
rules.parse(tagged).pprint()
In [385]:
# Parse again and show the resulting tree as the cell output.
rules.parse(tagged)
Out[385]:
In [368]:
# Next example sentence: tokenize it, then POS-tag the tokens.
sentence = "the dog saw a man in the park"
tokens = word_tokenize(sentence)
tagged = pos_tag(tokens)
In [369]:
# Display the (token, tag) pairs for the sentence above.
tagged
Out[369]:
In [377]:
# Cascaded grammar: the first four stages wrap single tags (DT, NN, VBD, IN)
# in Det / N / V / P chunks; the last three stages combine those chunks into
# NP, PP and VP. Stage order matters because later stages match the labels
# created by earlier ones.
rules = RegexpParser("""
Det : {<DT>}
N : {<NN>}
V : {<VBD>}
P : {<IN>}
NP : {<Det><N>}
PP : {<P><NP>}
VP : {<V><NP>}
""")
# Pretty-print the cascaded chunking result as text.
rules.parse(tagged).pprint()
In [378]:
# Parse again and show the resulting tree as the cell output.
rules.parse(tagged)
Out[378]:
In [379]:
# Example sentence with several adjectives: tokenize it, then POS-tag the tokens.
sentence = "the angry bear chased the frightened little squirrel"
tokens = word_tokenize(sentence)
tagged = pos_tag(tokens)
In [380]:
# Display the (token, tag) pairs for the sentence above.
tagged
Out[380]:
In [ ]:
# TODO: the grammar string is still empty — no chunk rules have been written
# for this sentence yet.
rules = RegexpParser("""
""")
In [ ]:
In [387]:
from konlpy.corpus import kolaw
# Read the first document of the kolaw corpus. The context manager closes
# the file handle once the text has been read.
with kolaw.open(kolaw.fileids()[0]) as corpus_file:
    corpus = corpus_file.read()
# Tag the whole text; each item is indexed below as [0] = text, [1] = tag.
tagged = Kkma().pos(corpus)
In [390]:
# Remove particles: keep only items whose text is longer than one character
# and whose tag does not start with "J". Named loop variables are used
# instead of `_`, which IPython reserves for the last cell output.
result = [
    morpheme
    for morpheme, tag in tagged
    if len(morpheme) > 1 and not tag.startswith("J")
]
In [392]:
from nltk import Text
# Wrap the filtered token list in an NLTK Text object.
textObj = Text(result)
In [402]:
# Vocabulary of the text; it is passed to generate_from_frequencies() below,
# i.e. used as a term -> frequency mapping.
termList = textObj.vocab()
In [417]:
from wordcloud import WordCloud
# NOTE(review): absolute, macOS-style font path — presumably chosen so Hangul
# glyphs render; this will need adjusting on other systems.
path = "/Library/Fonts/AppleGothic.ttf"
# Word cloud limited to the 30 most frequent words, on a white background.
wc = WordCloud(font_path=path,max_words=30,background_color="white")
In [418]:
# Build the cloud from the term frequencies and show it as an image.
wc.generate_from_frequencies(termList)
wc.to_image()
Out[418]:
In [420]:
# The cloud can also be extracted as an array
temp = wc.to_array()
In [450]:
import os

NEWS_DIR = "./News"
# Pick the IT/science articles. Sorting makes the concatenation order
# reproducible, because os.listdir() returns entries in arbitrary order.
article_names = sorted(
    name
    for name in os.listdir(NEWS_DIR)
    if name.endswith(".txt") and name.startswith("IT과학")
)
articles = []
for name in article_names:
    with open(os.path.join(NEWS_DIR, name), encoding="utf-8") as f:
        articles.append(f.read())
# Join with a newline so the last word of one article is not fused with the
# first word of the next one before tokenization.
corpus = "\n".join(articles)
In [451]:
# Create the analyzer once and reuse it; constructing Kkma() inside the loop
# would repeat the analyzer setup for every single token.
noun_extractor = Kkma()
tagged = []
for token in word_tokenize(corpus):
    # nouns() returns the nouns found in this token; collect them all.
    tagged.extend(noun_extractor.nouns(token))
In [452]:
from collections import Counter
# Count how often each extracted noun occurs. Counter is a dict subclass, so
# it can be used anywhere a noun -> count mapping is expected.
nounList = Counter(tagged)
In [453]:
# Number of distinct nouns. Both values are always equal: set(nounList) is
# built from the mapping's keys, which are already unique.
len(nounList), len(set(nounList))
Out[453]:
In [454]:
# Reuse the same WordCloud object for the news noun counts and show the image.
wc.generate_from_frequencies(nounList)
wc.to_image()
Out[454]:
728x90
반응형