728x90
반응형
In [1]:
from konlpy.corpus import kobill
def getLexicon(corpus=None):
    """Return every distinct whitespace-delimited token found in the corpus.

    Tokens appear in order of first occurrence, scanning documents in the
    order ``corpus.fileids()`` yields them.

    Args:
        corpus: Object exposing ``fileids()`` and ``open(file_id)``, where
            ``open`` returns a readable text file usable as a context
            manager. Defaults to the ``kobill`` corpus imported by this file.

    Returns:
        list: The unique tokens, first-seen order preserved.
    """
    if corpus is None:
        corpus = kobill
    lexicon = []
    # Membership is tested against a set; testing against the growing list
    # rescans it for every token and makes the whole build quadratic.
    seen = set()
    for docName in corpus.fileids():
        with corpus.open(docName) as handle:  # close each file promptly
            document = handle.read()
        for token in document.split():
            if token not in seen:
                seen.add(token)
                lexicon.append(token)
    return lexicon
In [2]:
# IPython magic: time repeated runs of the list-based lexicon builder.
%timeit getLexicon()
In [3]:
def getLexiconBySet(corpus=None):
    """Return the distinct whitespace-delimited tokens of the corpus.

    Unlike ``getLexicon``-style first-seen ordering, the order of the
    returned list is whatever order the underlying ``set`` iterates in.

    Args:
        corpus: Object exposing ``fileids()`` and ``open(file_id)``, where
            ``open`` returns a readable text file usable as a context
            manager. Defaults to the ``kobill`` corpus imported by this file.

    Returns:
        list: The unique tokens, in arbitrary order.
    """
    if corpus is None:
        corpus = kobill
    # Feed tokens straight into the set instead of first accumulating a
    # list that holds every (duplicate) token of the whole corpus.
    vocabulary = set()
    for docName in corpus.fileids():
        with corpus.open(docName) as handle:  # close each file promptly
            vocabulary.update(handle.read().split())
    return list(vocabulary)
In [4]:
# Build the vocabulary once; later cells pass it to the document-representation builders.
lexicon = getLexiconBySet()
In [5]:
def getDocRepr(lexicon, corpus=None):
    """Build a dense binary bag-of-words vector for every corpus document.

    Args:
        lexicon: Sequence of tokens; position ``i`` of each vector
            corresponds to ``lexicon[i]``. If a token occurs more than once
            in ``lexicon``, its first position is used.
        corpus: Object exposing ``fileids()`` and ``open(file_id)``, where
            ``open`` returns a readable text file usable as a context
            manager. Defaults to the ``kobill`` corpus imported by this file.

    Returns:
        tuple: ``(docList, docRepr)`` where ``docList`` is the list of file
        ids and ``docRepr[k]`` is the 0/1 vector (length ``len(lexicon)``)
        for ``docList[k]``. Tokens absent from ``lexicon`` are ignored.
    """
    if corpus is None:
        corpus = kobill
    # Resolve token -> first index once. The per-token ``in`` test and
    # ``list.index`` call each rescanned the whole lexicon.
    position = {}
    for index, token in enumerate(lexicon):
        position.setdefault(token, index)

    docList = []  # document ids, in corpus order
    docRepr = []  # one bag-of-words vector per document
    for docName in corpus.fileids():
        with corpus.open(docName) as handle:  # close each file promptly
            document = handle.read()
        docList.append(docName)
        # Bag-of-words vector starts as all zeros, one slot per lexicon entry.
        docVector = [0] * len(lexicon)
        for token in document.split():
            index = position.get(token)
            if index is not None:
                # Token is in the lexicon: mark its slot as present.
                docVector[index] = 1
        docRepr.append(docVector)
    return docList, docRepr
In [6]:
def getDocReprByDict(lexicon, corpus=None):
    """Build a sparse binary bag-of-words mapping for every corpus document.

    Args:
        lexicon: Sequence of tokens. If a token occurs more than once in
            ``lexicon``, its first position is used as its index.
        corpus: Object exposing ``fileids()`` and ``open(file_id)``, where
            ``open`` returns a readable text file usable as a context
            manager. Defaults to the ``kobill`` corpus imported by this file.

    Returns:
        dict: ``{file_id: {lexicon_index: 1}}``. Only indices of tokens that
        occur in the document are present; a document with no lexicon
        tokens maps to an empty dict.
    """
    if corpus is None:
        corpus = kobill
    # Resolve token -> first index once. The per-token ``in`` test and
    # ``list.index`` call each rescanned the whole lexicon.
    position = {}
    for index, token in enumerate(lexicon):
        position.setdefault(token, index)

    # key = document id, value = bag of words stored as a dict, not a list
    docRepr = dict()
    for docName in corpus.fileids():
        with corpus.open(docName) as handle:  # close each file promptly
            document = handle.read()
        bag = docRepr[docName] = dict()
        for token in document.split():
            index = position.get(token)
            if index is not None:
                bag[index] = 1
    return docRepr
In [9]:
# IPython magic: time repeated runs of the dict-based document representation builder.
%timeit getDocReprByDict(lexicon)
In [10]:
# Sparse representation: file id -> {lexicon index: 1}.
docRepr = getDocReprByDict(lexicon)
In [12]:
# Show the entry stored under lexicon index 23 for one document next to the token at that index.
# The inner mapping is a plain dict, so this raises KeyError if that token does not occur in the document.
docRepr['1809890.txt'][23], lexicon[23]
Out[12]:
In [13]:
from collections import defaultdict
def getDocReprByDefaultDict(lexicon, corpus=None):
    """Build a token-keyed binary bag-of-words mapping for every document.

    Args:
        lexicon: Accepted for signature parity with the other builders but
            not consulted: every token of every document is recorded.
        corpus: Object exposing ``fileids()`` and ``open(file_id)``, where
            ``open`` returns a readable text file usable as a context
            manager. Defaults to the ``kobill`` corpus imported by this file.

    Returns:
        defaultdict: ``{file_id: {token: 1}}`` as nested ``defaultdict``
        objects. Looking up a missing file id or token does not raise; it
        inserts an empty mapping or ``0`` respectively. A document that
        contains no tokens gets no entry.
    """
    if corpus is None:
        corpus = kobill
    docRepr = defaultdict(lambda: defaultdict(int))
    for docName in corpus.fileids():
        with corpus.open(docName) as handle:  # close each file promptly
            document = handle.read()
        for token in document.split():
            docRepr[docName][token] = 1
    return docRepr
In [14]:
# Rebind docRepr to the token-keyed representation: file id -> {token: 1}.
docRepr = getDocReprByDefaultDict(lexicon)
In [15]:
# Show the token -> 1 mapping recorded for this one document.
docRepr['1809890.txt']
Out[15]:
In [16]:
# invertedDocument: inverted document structure (term -> documents)
def invertedDocument(DTM):
    """Invert a document->term mapping into a term->document mapping.

    Args:
        DTM: Mapping ``{doc_id: {term: freq}}``.

    Returns:
        defaultdict: ``{term: {doc_id: freq}}`` as nested ``defaultdict``
        objects. Looking up a missing term does not raise; it inserts an
        empty mapping.
    """
    TDM = defaultdict(lambda: defaultdict(int))
    # Author's note: in pure Python the nested dict stands in for both parts
    # of an inverted index, which would otherwise be split into
    #   dictionary: term -> file pointer (hash held in memory)
    #   posting:    struct(docid, freq, next=fp) records (file-backed DB)
    for docName, docVector in DTM.items():
        for term, freq in docVector.items():
            TDM[term][docName] = freq
    return TDM
In [17]:
# Invert the file id -> {token: 1} mapping into token -> {file id: 1} postings.
TDM = invertedDocument(docRepr)
In [18]:
# Show the postings of two terms side by side.
# TDM is a defaultdict, so a term that was never indexed is inserted as an empty entry by this lookup.
TDM["국회"], TDM["의원"]
Out[18]:
In [19]:
# Boolean OR query: ids of documents containing either term.
# NOTE(review): Python's `or` between two dicts returns the first non-empty
# operand unchanged, so the second term's postings were silently dropped;
# a set union over the document ids combines both posting lists.
set(TDM["국회"]) | set(TDM["의원"])
Out[19]:
In [ ]:
728x90
반응형