In [173]:
collection = [
    ("Document1", "This is a sample"),       # "a" is the distinctive term
    ("Document2", "This is another sample"), # "another" is the distinctive term
    ("Document3", "This is not sample")      # "not" is the distinctive term
]
query = "this is a sample"
In [161]:
# Build the collection-wide (global) structures
# in-memory (hash keyed by term)
# {term1: posting position, term2: posting position, ...}
globalLexicon = dict()
# [0: document name, 1: document name, ...]
globalDocument = list()
# on disk (out of memory)
# [0: (term idx, doc idx, frequency, next position), ...]
# this can live only in a file rather than in memory, because we track its positions
globalPosting = list()
In [162]:
for (docName, docContent) in collection:
    # docIdx stands in for a pointer/key; we assume document names never collide
    docIdx = len(globalDocument)
    globalDocument.append(docName)
    # {term: frequency, term: frequency, ...}
    localPosting = dict()
    # local pass: tokenize by whitespace
    for term in docContent.lower().split():
        if term not in localPosting.keys():
            localPosting[term] = 1
        else:
            localPosting[term] += 1
    # fp -> struct(term, freq) (localPosting)
    # this step merges and sorts at the same time
    for indexTerm, termFreq in localPosting.items():
        if indexTerm not in globalLexicon.keys():
            lexiconIdx = len(globalLexicon)
            postingIdx = len(globalPosting)  # fseek
            postingData = (lexiconIdx, docIdx, termFreq, -1)
            globalPosting.append(postingData)
            globalLexicon[indexTerm] = postingIdx  # position in globalPosting (ptr:idx)
        else:
            lexiconIdx = list(globalLexicon.keys()).index(indexTerm)
            postingIdx = len(globalPosting)
            beforeIdx = globalLexicon[indexTerm]
            postingData = (lexiconIdx, docIdx, termFreq, beforeIdx)
            globalPosting.append(postingData)
            globalLexicon[indexTerm] = postingIdx  # position in globalPosting (ptr:idx)
    # print(localPosting)
    # print(globalDocument)
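As a rough illustration of the layout (assuming the three-document collection above and Python 3.7+, where dicts preserve insertion order), the posting chain for "sample" can be read off by following the next-position field from the head stored in the lexicon:

# expected shape (illustrative) after the loop above has run:
# globalLexicon["sample"] == 11             head of the chain = most recent posting
# globalPosting[11]       == (3, 2, 1, 7)   (term 3, Document3, freq 1, next at 7)
# globalPosting[7]        == (3, 1, 1, 3)   (term 3, Document2, freq 1, next at 3)
# globalPosting[3]        == (3, 0, 1, -1)  (term 3, Document1, freq 1, end of chain)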
In [163]:
len(globalDocument)
Out[163]:
In [164]:
globalDocument.index("Document1")
Out[164]:
In [165]:
globalLexicon
Out[165]:
In [166]:
globalPosting
# a next pointer of -1 marks the end of a chain
# the third field is the frequency
Out[166]:
In [167]:
for indexTerm, postingIdx in globalLexicon.items():
    # indexTerm: the term, postingIdx: its position in globalPosting
    print(indexTerm)
    while True:  # follow the posting chain until the next pointer is -1
        if postingIdx == -1:
            break
        postingData = globalPosting[postingIdx]
        print("  {0} / {1} / {2}".format(globalDocument[postingData[1]], postingData[2], postingData[3]))
        postingIdx = postingData[3]
In [168]:
from math import log10

def rawTF(freq):
    return freq

def normTF(freq, totalCount):
    return (freq / totalCount)

def logTF(freq):
    if freq > 0:
        return 1 + log10(freq)
    else:
        return 0

def maxTF(a, freq, maxFreq):  # double normalization K -- typically a=0 for documents, a=0.5 for queries
    return a + ((1 - a) * (freq / maxFreq))
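A quick sanity check of the four weighting variants, with made-up numbers (a term that occurs twice in a 10-token document whose most frequent term occurs 4 times):

print(rawTF(2))          # 2
print(normTF(2, 10))     # 0.2
print(logTF(2))          # 1 + log10(2) ≈ 1.301
print(maxTF(0.5, 2, 4))  # 0.5 + 0.5 * (2 / 4) = 0.75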
In [169]:
globalTF = list()
for (docName, docContent) in collection:
    docIdx = globalDocument.index(docName)  # reuse the index assigned during indexing
    localPosting = dict()
    maxCount = 0
    # local pass: tokenize by whitespace
    for term in docContent.lower().split():
        maxCount += 1
        if term not in localPosting.keys():
            localPosting[term] = 1
        else:
            localPosting[term] += 1
    print(docName)
    a = 0.5
    maxFreq = max(localPosting.values())
    for term, freq in localPosting.items():
        print("1. {0} rawTF : {1}".format(term, rawTF(freq)))
        print("2. {0} normTF : {1}".format(term, normTF(freq, maxCount)))
        print("3. {0} logTF : {1}".format(term, logTF(freq)))
        print("4. {0} maxTF : {1}".format(term, maxTF(a, freq, maxFreq)))
        print()
        localPosting[term] = maxTF(a, freq, maxFreq)
    for indexTerm, termTF in localPosting.items():
        if indexTerm not in globalLexicon.keys():
            lexiconIdx = len(globalLexicon)
            postingIdx = len(globalTF)  # fseek
            postingData = (lexiconIdx, docIdx, termTF, -1)
            globalTF.append(postingData)
            globalLexicon[indexTerm] = postingIdx  # position in globalPosting (ptr:idx)
        else:
            lexiconIdx = list(globalLexicon.keys()).index(indexTerm)
            postingIdx = len(globalTF)
            beforeIdx = globalLexicon[indexTerm]
            postingData = (lexiconIdx, docIdx, termTF, beforeIdx)
            globalTF.append(postingData)
            globalLexicon[indexTerm] = postingIdx
In [170]:
print(globalPosting), print(globalTF)
Out[170]:
In [317]:
# standard IDF
def rawIdf(df, N):
    return log10(N / df)

# keeps stopwords like "the"/"a" from vanishing entirely => "to be or not to be"
def smoothingIdf(df, N):
    return log10((N + 1) / df)

def probabilityIdf(df, N):
    return log10((N - df + 1) / df)
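A quick comparison of the three variants for an illustrative collection of three documents (df = 3: the term occurs in every document; df = 1: it occurs in a single document):

nDocs = 3
print(rawIdf(3, nDocs), smoothingIdf(3, nDocs), probabilityIdf(3, nDocs))  # 0.0, ≈0.125, ≈-0.477
print(rawIdf(1, nDocs), smoothingIdf(1, nDocs), probabilityIdf(1, nDocs))  # ≈0.477, ≈0.602, ≈0.477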
In [318]:
collection = [
    ("Document1", "This is a sample"),       # "a" is the distinctive term
    ("Document2", "This is another sample"), # "another" is the distinctive term
    ("Document3", "This is not sample"),     # "not" is the distinctive term
]
query = "this is sample"
In [319]:
globalLexicon = dict()
globalDocument = list()
globalPosting = list()
for (docName, docContent) in collection:
    # docIdx stands in for a pointer/key; we assume document names never collide
    docIdx = len(globalDocument)
    globalDocument.append(docName)
    # {term: frequency, term: frequency, ...}
    localPosting = dict()
    # local pass: tokenize by whitespace
    for term in docContent.lower().split():
        if term not in localPosting.keys():
            localPosting[term] = 1
        else:
            localPosting[term] += 1
    maxFreq = max(localPosting.values())
    # fp -> struct(term, freq) (localPosting)
    # this step merges and sorts at the same time
    for indexTerm, termFreq in localPosting.items():
        if indexTerm not in globalLexicon.keys():
            lexiconIdx = len(globalLexicon)
            postingIdx = len(globalPosting)  # fseek
            postingData = [lexiconIdx, docIdx, maxTF(0, termFreq, maxFreq), -1]
            globalPosting.append(postingData)
            globalLexicon[indexTerm] = postingIdx  # position in globalPosting (ptr:idx)
        else:
            lexiconIdx = list(globalLexicon.keys()).index(indexTerm)
            postingIdx = len(globalPosting)
            beforeIdx = globalLexicon[indexTerm]
            postingData = [lexiconIdx, docIdx, maxTF(0, termFreq, maxFreq), beforeIdx]
            globalPosting.append(postingData)
            globalLexicon[indexTerm] = postingIdx  # position in globalPosting (ptr:idx)
In [320]:
globalPosting
# every weight is 1.0 here, since each term occurs exactly once per document
Out[320]:
In [321]:
# final pass (TF & IDF)
N = len(globalDocument)
globalLexiconIDF = dict()
for indexTerm, postingIdx in globalLexicon.items():
    df = 0
    oldPostingIdx = postingIdx
    # first walk: count the document frequency (df) of the term
    while True:
        if postingIdx == -1:
            break
        df += 1
        postingData = globalPosting[postingIdx]
        postingIdx = postingData[3]
    postingIdx = oldPostingIdx
    idf = rawIdf(df, N)
    globalLexiconIDF[indexTerm] = idf
    print("{0} / IDF-{1}".format(indexTerm, idf))
    # second walk: turn each stored TF into a TF-IDF weight
    while True:
        if postingIdx == -1:
            break
        postingData = globalPosting[postingIdx]
        TF = postingData[2]
        postingData[2] = postingData[2] * idf
        print(" Document:{0} / TF:{1} / TF-IDF:{2}".format(globalDocument[postingData[1]],
                                                           TF,
                                                           globalPosting[postingIdx][2]))
        postingIdx = postingData[3]
In [322]:
def euclidean(x, y):
    # squared difference in one dimension; summed per document this gives a squared Euclidean distance
    return (x - y) ** 2
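Summed over all terms, these per-term values give the usual squared Euclidean distance; for example, with two made-up weight vectors:

q = [1.0, 0.5, 0.0]
d = [0.5, 0.5, 1.0]
print(sum(euclidean(x, y) for x, y in zip(q, d)))  # 0.5**2 + 0**2 + 1**2 = 1.25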
In [323]:
query  # treat the query like a tiny document
queryPosting = dict()
for term in query.lower().split():
    if term not in queryPosting.keys():
        queryPosting[term] = 1
    else:
        queryPosting[term] += 1
maxFreq = max(queryPosting.values())
# fp -> struct(term, freq) (localPosting)
# merge and weight, as above
for indexTerm, termFreq in queryPosting.items():
    queryPosting[indexTerm] = maxTF(0.5, termFreq, maxFreq)
In [324]:
queryPosting
Out[324]:
In [325]:
globalLexicon.items()
Out[325]:
In [326]:
candidateList = dict()  # still just the candidate set at this point
for indexTerm, postingIdx in globalLexicon.items():
    queryTFIDF = 0
    if indexTerm in queryPosting.keys():
        queryTFIDF = queryPosting[indexTerm] * globalLexiconIDF[indexTerm]
    while True:
        if postingIdx == -1:
            break
        postingData = globalPosting[postingIdx]
        postingIdx = postingData[3]
        documentWeight = postingData[2]
        if postingData[1] not in candidateList.keys():
            candidateList[postingData[1]] = euclidean(queryTFIDF, documentWeight)   # accumulate per document
        else:
            candidateList[postingData[1]] += euclidean(queryTFIDF, documentWeight)
In [327]:
resultList = sorted(candidateList.items(), key=lambda x: x[1])
for i, (documentIdx, distance) in enumerate(resultList):
    print("Rank: {0}. Document: {1} / Distance: {2}".format((i + 1), globalDocument[documentIdx], distance))
In [328]:
query = "this is a sample"
queryPosting = dict()
for term in query.lower().split():
if term not in queryPosting.keys():
queryPosting[term] = 1
else:
queryPosting[term] += 1
maxFreq = max(queryPosting.values())
# fp -> struct(단어,빈도) ( localPosting)
# Merge와 sorting이 같이 있는 것
for indexTerm, termFreq in queryPosting.items():
queryPosting[indexTerm] = maxTF(0.5,termFreq,maxFreq)
candidateList = dict() # 아직까진 검색후보군이어서
for indexTerm, postingIdx in globalLexicon.items():
queryTFIDF = 0
if indexTerm in queryPosting.keys():
queryTFIDF = queryPosting[indexTerm] * globalLexiconIDF[indexTerm]
while True:
if postingIdx == -1:
break
postingData = globalPosting[postingIdx]
postingIdx = postingData[3]
documentWeight = postingData[2]
if postingData[1] not in candidateList.keys():
candidateList[postingData[1]] = euclidean(queryTFIDF,documentWeight) # 각 다큐먼트마다 누적시켜야함
else:
candidateList[postingData[1]] += euclidean(queryTFIDF,documentWeight)
resultList = sorted(candidateList.items(), key=lambda x:x[1])
for i, (documentIdx, distance) in enumerate(resultList):
print("순위: {0}. 문서 : {1} / Distance:{2}".format((i+1),globalDocument[documentIdx], distance))
print(" {0}".format(collection[documentIdx][1]))
# 거리가 0인것은 아예 같다는 것.
In [348]:
collection = [
    ("Document1", "This is a sample"),       # "a" is the distinctive term
    ("Document2", "This is another sample"), # "another" is the distinctive term
    ("Document3", "This is not sample"),     # "not" is the distinctive term
    ("Document4", "a not sample"),
    ("Document5", "not"),
    ("Document6", "not sample"),
]
query = "this is sample"
In [349]:
globalLexicon = dict()
globalDocument = list()
globalPosting = list()
for (docName, docContent) in collection:
    # docIdx stands in for a pointer/key; we assume document names never collide
    docIdx = len(globalDocument)
    globalDocument.append(docName)
    # {term: frequency, term: frequency, ...}
    localPosting = dict()
    # local pass: tokenize by whitespace
    for term in docContent.lower().split():
        if term not in localPosting.keys():
            localPosting[term] = 1
        else:
            localPosting[term] += 1
    maxFreq = max(localPosting.values())
    # fp -> struct(term, freq) (localPosting)
    # this step merges and sorts at the same time
    for indexTerm, termFreq in localPosting.items():
        if indexTerm not in globalLexicon.keys():
            lexiconIdx = len(globalLexicon)
            postingIdx = len(globalPosting)  # fseek
            postingData = [lexiconIdx, docIdx, maxTF(0, termFreq, maxFreq), -1]
            globalPosting.append(postingData)
            globalLexicon[indexTerm] = postingIdx  # position in globalPosting (ptr:idx)
        else:
            lexiconIdx = list(globalLexicon.keys()).index(indexTerm)
            postingIdx = len(globalPosting)
            beforeIdx = globalLexicon[indexTerm]
            postingData = [lexiconIdx, docIdx, maxTF(0, termFreq, maxFreq), beforeIdx]
            globalPosting.append(postingData)
            globalLexicon[indexTerm] = postingIdx  # position in globalPosting (ptr:idx)
In [350]:
# final pass (TF & IDF)
N = len(globalDocument)
globalLexiconIDF = dict()
for indexTerm, postingIdx in globalLexicon.items():
    df = 0
    oldPostingIdx = postingIdx
    # first walk: count the document frequency (df) of the term
    while True:
        if postingIdx == -1:
            break
        df += 1
        postingData = globalPosting[postingIdx]
        postingIdx = postingData[3]
    postingIdx = oldPostingIdx
    idf = smoothingIdf(df, N)
    globalLexiconIDF[indexTerm] = idf
    print("{0} / IDF-{1}".format(indexTerm, idf))
    # second walk: turn each stored TF into a TF-IDF weight
    while True:
        if postingIdx == -1:
            break
        postingData = globalPosting[postingIdx]
        TF = postingData[2]
        postingData[2] = postingData[2] * idf
        print(" Document:{0} / TF:{1} / TF-IDF:{2}".format(globalDocument[postingData[1]],
                                                           TF,
                                                           globalPosting[postingIdx][2]))
        postingIdx = postingData[3]
In [362]:
query = "not"
queryPosting = dict()
for term in query.lower().split():
if term not in queryPosting.keys():
queryPosting[term] = 1
else:
queryPosting[term] += 1
maxFreq = max(queryPosting.values())
# fp -> struct(단어,빈도) ( localPosting)
# Merge와 sorting이 같이 있는 것
for indexTerm, termFreq in queryPosting.items():
queryPosting[indexTerm] = maxTF(0.5,termFreq,maxFreq)
candidateList = dict() # 아직까진 검색후보군이어서
for indexTerm, postingIdx in globalLexicon.items():
queryTFIDF = 0
if indexTerm in queryPosting.keys():
queryTFIDF = queryPosting[indexTerm] * globalLexiconIDF[indexTerm]
while True:
if postingIdx == -1:
break
postingData = globalPosting[postingIdx]
postingIdx = postingData[3]
documentWeight = postingData[2]
if postingData[1] not in candidateList.keys():
candidateList[postingData[1]] = euclidean(queryTFIDF,documentWeight) # 각 다큐먼트마다 누적시켜야함
else:
candidateList[postingData[1]] += euclidean(queryTFIDF,documentWeight)
resultList = sorted(candidateList.items(), key=lambda x:x[1])
print(query)
for i, (documentIdx, distance) in enumerate(resultList):
print("순위: {0}. 문서 : {1} / Distance:{2}".format((i+1),globalDocument[documentIdx], distance))
print(" {0}".format(collection[documentIdx][1]))
# 거리가 0인것은 아예 같다는 것.
In [363]:
def innerProduct(x, y):
    # one term's contribution to the dot product between query and document vectors
    return x * y
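Accumulating these products per document and dividing by the document vector's length gives a cosine-style score (the query norm is the same for every document, so omitting it does not change the ranking). A minimal sketch with made-up vectors:

from math import sqrt

q = [1.0, 0.5]
d = [0.8, 0.4]
dot = sum(innerProduct(x, y) for x, y in zip(q, d))
print(dot / sqrt(sum(w ** 2 for w in d)))  # ≈ 1.118 = |q|, since q and d are parallel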
In [364]:
candidateList = dict()
for indexTerm, queryWeight in queryPosting.items():
    if indexTerm in globalLexicon.keys():
        postingIdx = globalLexicon[indexTerm]
        while True:
            if postingIdx == -1:
                break
            postingData = globalPosting[postingIdx]
            postingIdx = postingData[3]
            documentWeight = postingData[2]
            if postingData[1] not in candidateList.keys():
                candidateList[postingData[1]] = innerProduct(queryWeight, documentWeight)   # accumulate per document
            else:
                candidateList[postingData[1]] += innerProduct(queryWeight, documentWeight)
# normalise by document vector length (globalDocumentLength is built in the next cell)
for documentIdx, sumProduct in candidateList.items():
    candidateList[documentIdx] /= globalDocumentLength[documentIdx] ** 0.5   # sqrt of the summed squares
In [365]:
# final pass (TF & IDF), this time also accumulating document vector lengths
N = len(globalDocument)
globalLexiconIDF = dict()
globalDocumentLength = dict()
for indexTerm, postingIdx in globalLexicon.items():
    df = 0
    oldPostingIdx = postingIdx
    # first walk: count the document frequency (df) of the term
    while True:
        if postingIdx == -1:
            break
        df += 1
        postingData = globalPosting[postingIdx]
        postingIdx = postingData[3]
    postingIdx = oldPostingIdx
    idf = smoothingIdf(df, N)
    globalLexiconIDF[indexTerm] = idf
    print("{0} / IDF-{1}".format(indexTerm, idf))
    # second walk: convert TF to TF-IDF and accumulate the squared weight per document
    while True:
        if postingIdx == -1:
            break
        postingData = globalPosting[postingIdx]
        TF = postingData[2]
        postingData[2] = postingData[2] * idf
        print(" Document:{0} / TF:{1} / TF-IDF:{2}".format(globalDocument[postingData[1]],
                                                           TF,
                                                           globalPosting[postingIdx][2]))
        if postingData[1] not in globalDocumentLength.keys():
            globalDocumentLength[postingData[1]] = postingData[2] ** 2    # accumulate per document
        else:
            globalDocumentLength[postingData[1]] += postingData[2] ** 2
        postingIdx = postingData[3]   # advance only after using this posting's weight
In [367]:
resultList = sorted(candidateList.items(), key=lambda x: x[1], reverse=True)
print(query)
for i, (documentIdx, similarity) in enumerate(resultList):
    print("Rank: {0}. Document: {1} / Similarity: {2}".format((i + 1), globalDocument[documentIdx], similarity))
    print("  {0}".format(collection[documentIdx][1]))
# here higher is better: the larger the similarity, the closer the document is to the query
In [401]:
from konlpy.corpus import kobill

def getLexiconBySet():
    lexicon = list()
    for docName in kobill.fileids():
        document = kobill.open(docName).read()
        for token in document.split():
            lexicon.append(token)   # collect whole tokens (extend() would add individual characters)
    return list(set(lexicon))
In [402]:
from collections import defaultdict
from konlpy.tag import Kkma

ma = Kkma().morphs

def getDocReprByDefaultDict(lexicon):
    docRepr = defaultdict(lambda: defaultdict(int))
    for docName in kobill.fileids():
        document = kobill.open(docName).read()
        for token in document.split():
            for morpheme in ma(token):
                docRepr[docName][morpheme] += 1
    return docRepr
In [408]:
txt = getLexiconBySet()
DTM = getDocReprByDefaultDict(txt)
In [412]:
# invertedDocument (inverted file structure, keyed by term)
def convertInvertedDocument(DTM):
    TDM = defaultdict(lambda: defaultdict(int))
    for fileName, termList in DTM.items():
        maxFreq = max(termList.values())
        for term, freq in termList.items():
            TDM[term][fileName] = maxTF(0, freq, maxFreq)
    return TDM
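On a tiny made-up DTM (no konlpy required) the inversion works like this:

toyDTM = {"d1": {"a": 2, "b": 1}, "d2": {"a": 1}}
toyTDM = convertInvertedDocument(toyDTM)
print(dict(toyTDM["a"]))  # {'d1': 1.0, 'd2': 1.0}
print(dict(toyTDM["b"]))  # {'d1': 0.5}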
In [434]:
# to record frequencies, getDocReprByDefaultDict above uses += rather than =
TDM = convertInvertedDocument(DTM)
# TDM
In [414]:
# term-document matrix -> term-weight matrix
# with defaultdict there is no need to worry about missing keys
N = len(DTM)

def TDM2TWM(TDM):
    TWM = defaultdict(lambda: defaultdict(float))
    DVL = defaultdict(float)   # document vector lengths (sums of squared weights)
    for term, tfList in TDM.items():
        df = len(tfList)
        idf = rawIdf(df, N)
        for fileName, tf in tfList.items():
            TWM[term][fileName] = tf * idf
            DVL[fileName] += TWM[term][fileName] ** 2
    return TWM, DVL
In [415]:
TWM,DVL = TDM2TWM(TDM)
In [433]:
#TWM
In [417]:
DVL
Out[417]:
In [423]:
query = "국방의 의무와 보편적 교육에 대한 법안"
In [425]:
queryRepr = defaultdict(int)   # raw term frequencies of the query
for token in query.split():
    for morpheme in ma(token):
        queryRepr[morpheme] += 1
queryWeight = defaultdict(float)
maxFreq = max(queryRepr.values())
for token, freq in queryRepr.items():
    if token in TWM.keys():
        tf = maxTF(0.5, freq, maxFreq)
        df = len(TWM[token])
        idf = rawIdf(df, N)
        queryWeight[token] = tf * idf
In [426]:
queryWeight
Out[426]:
In [435]:
from math import sqrt

candidateList = defaultdict(float)
for token, weight in queryWeight.items():
    for fileName, tfidf in TWM[token].items():
        print(" {0} : {1} = {2} * {3}".format(token, fileName, weight, tfidf))
        candidateList[fileName] += innerProduct(weight, tfidf)
for fileName, sumProduct in candidateList.items():
    candidateList[fileName] /= sqrt(DVL[fileName])
In [439]:
from nltk.tokenize import sent_tokenize

K = 5
resultList = sorted(candidateList.items(), key=lambda x: x[1], reverse=True)
for i, (fileName, similarity) in enumerate(resultList):
    if i < K:
        print(" Rank:{0} / Document:{1} / Similarity:{2:.4f}".format((i + 1), fileName, similarity))
        content = kobill.open(fileName).read()
        print(sent_tokenize(content)[:5])