In [2]:
import os
def getFileList(base='./', ext='.txt'):
    fileList = list()
    for file in os.listdir(base):
        if file.endswith(ext):  # equivalent to: file.split('.')[-1] == ext.lstrip('.')
            fileList.append('{0}/{1}'.format(base, file))
    return fileList
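As a quick sanity check, calling the helper on the news folder used below returns relative paths (the file names here are hypothetical):

getFileList('./News')
# -> ['./News/news01.txt', './News/news02.txt', ...]  (depends on the directory contents)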
In [3]:
# def getNewsRank():
#     for file in os.listdir(base):
#         with open(file, encoding='utf-8') as f:
#             content = f.read()
#     return content
In [4]:
def getContent(file):
    with open(file, encoding='utf-8') as f:
        content = f.read()
    return content
In [5]:
corpus = list()
for a in getFileList('./News'):
    document = getContent(a)
    corpus.append(document)
In [6]:
type(corpus)
Out[6]:
In [7]:
len(corpus)
Out[7]:
In [8]:
def ngramEojeol(sentence, n=2):
    # word-level (eojeol) n-grams
    tokens = sentence.split()
    ngram = []
    for i in range(len(tokens) - n + 1):
        ngram.append(' '.join(tokens[i:i + n]))
    return ngram
In [9]:
def ngramUmjeol(term, n=2):
    # syllable-level (umjeol) n-grams over the characters of a term
    ngram = []
    for i in range(len(term) - n + 1):
        ngram.append(''.join(term[i:i + n]))
    return ngram
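A quick check of the two helpers on an illustrative input (not taken from the corpus):

ngramEojeol('서울 아파트 전세 가격', 2)
# -> ['서울 아파트', '아파트 전세', '전세 가격']
ngramUmjeol('아파트', 2)
# -> ['아파', '파트']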
In [10]:
def getPatternList():
    patternList = {}
    # matches runs of characters that are NOT Korean jamo or syllables
    patternList['Korean'] = re.compile(r'([^ㄱ-ㅎㅏ-ㅣ가-힣]+)')
    patternList['rmeng'] = re.compile(r'[a-zA-Z]')
    patternList['Email'] = re.compile(r'(\w+@[a-zA-Z0-9\-_]{3,}(\.[a-zA-Z]{2,})+)')
    patternList['Whitespace'] = re.compile(r'\s{2,}')
    patternList['Punctuation'] = re.compile(r'[%s]{2,}' % re.escape(punctuation))
    patternList['Punctuation2'] = re.compile(r'[%s]' % re.escape(punctuation))
    return patternList
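re and punctuation are only imported in the next cell, which works because the names are resolved when getPatternList() is called. A minimal check of the cleanup on a made-up string (the address is hypothetical):

import re
from string import punctuation

sample = '문의 reporter@news.example.com  기사 원문!!'
p = getPatternList()
sample = p['Email'].sub(' ', sample)
sample = p['Punctuation'].sub(' ', sample)
sample = p['Whitespace'].sub(' ', sample)
# the email address and the repeated punctuation are replaced, then runs of spaces collapse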
In [11]:
import re
from string import punctuation
corpus2 = list()
patternList = getPatternList()  # compile the patterns once and reuse them for every document
for i in range(len(corpus)):
    corpus[i] = patternList['Email'].sub(" ", corpus[i])
    corpus[i] = patternList['Whitespace'].sub(" ", corpus[i])
    corpus[i] = patternList['Punctuation'].sub(" ", corpus[i])
    corpus[i] = patternList['Punctuation2'].sub(" ", corpus[i])
    corpus[i] = patternList['rmeng'].sub("", corpus[i])
    # corpus[i] = patternList['Nonword'].sub(" ", corpus[i])
    # corpus[i] = patternList['Numeric'].sub(" ", corpus[i])
    # corpus[i] = patternList['Maxlength'].sub(" ", corpus[i])
    corpus2.append(corpus[i])
In [12]:
from nltk.tokenize import sent_tokenize
newCorpus = list()
for i in range(len(corpus)):
    tokenNews = sent_tokenize(corpus[i])
    newCorpus.extend(tokenNews)
In [13]:
# the corpus after sent_tokenize
# newCorpus
In [14]:
from konlpy.tag import Kkma

kkma = Kkma()  # instantiate the analyzer once instead of once per token
dictTerm = list()
dictPos = list()
dictNoun = list()
dictNgram = list()
for sentence in newCorpus:
    for token in sentence.split():
        if len(token) > 1:
            dictTerm.append(token)
            dictPos.extend([morpheme for morpheme in kkma.morphs(token) if len(morpheme) > 1])
            dictNoun.extend([noun for noun in kkma.nouns(token) if len(noun) > 1])
            dictNgram.extend(ngramUmjeol(token))
dictTerm = list(set(dictTerm))
dictPos = list(set(dictPos))
dictNoun = list(set(dictNoun))
dictNgram = list(set(dictNgram))
In [15]:
len(dictTerm), len(dictPos), len(dictNoun), len(dictNgram)
Out[15]:
In [657]:
# print(dictTerm)
In [658]:
# print(dictPos)
In [659]:
# print(dictNoun)
In [660]:
# print(dictNgram)
In [20]:
def getNewsBySet():
    corpus3 = list()
    for docName in getFileList('./News'):
        document = getContent(docName)
        for token in document.split():
            corpus3.append(token)
    return list(set(corpus3))
In [661]:
txt = getNewsBySet()
# txt
In [22]:
from konlpy.tag import Kkma
from collections import defaultdict

ma = Kkma().morphs

def getNewsReprByDefaultDict(document):
    # builds the document-term matrix {docName: {morpheme: count}};
    # note the document argument is overwritten inside the loop, so it is effectively unused
    docRepr = defaultdict(lambda: defaultdict(int))
    for docName in getFileList('./News'):
        document = getContent(docName)
        for token in document.split():
            for morpheme in ma(token):
                docRepr[docName][morpheme] += 1
    return docRepr
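The result is a nested mapping from file name to morpheme to raw count, for example (file name and counts are hypothetical):

# DTM['./News/news01.txt']['아파트'] -> 3
# DTM['./News/news01.txt']['전세']   -> 1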
In [ ]:
DTM = getNewsReprByDefaultDict(txt)
# DTM
In [24]:
from math import log10

def rawTF(freq):
    return freq

def normTF(freq, totalCount):
    return freq / totalCount

def logTF(freq):
    if freq > 0:
        return 1 + log10(freq)
    else:
        return 0

def maxTF(a, freq, maxFreq):  # double normalization K: a=0 for documents, a=0.5 for queries
    return a + ((1 - a) * (freq / maxFreq))
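A quick numeric check of the four variants, assuming freq=3, totalCount=10, maxFreq=5, a=0.5:

# rawTF(3)         -> 3
# normTF(3, 10)    -> 0.3
# logTF(3)         -> 1 + log10(3) ≈ 1.477
# maxTF(0.5, 3, 5) -> 0.5 + 0.5 * (3/5) = 0.8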
In [25]:
def convertInvertedDocument(DTM):
    # invert the document-term matrix into a term-document matrix,
    # storing max-normalized TF (a=0) instead of raw counts
    TDM = defaultdict(lambda: defaultdict(int))
    for fileName, termList in DTM.items():
        maxFreq = max(termList.values())
        for term, freq in termList.items():
            TDM[term][fileName] = maxTF(0, freq, maxFreq)
    return TDM
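After inversion the outer key is the term, for example (hypothetical entry):

# TDM['아파트']['./News/news01.txt'] -> 0.75   (freq 3 out of maxFreq 4 in that document)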
In [ ]:
TDM = convertInvertedDocument(DTM)
# TDM
In [27]:
# standard IDF
def rawIdf(df, N):
    return log10(N / df)

# stopwords such as 'the' and 'a' are not removed => "to be or not to be"
def smoothingIdf(df, N):
    return log10((N + 1) / df)

def probabilityIdf(df, N):
    return log10((N - df + 1) / df)
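For intuition, with N=10 documents and a term appearing in df=3 of them:

# rawIdf(3, 10)         -> log10(10/3) ≈ 0.523
# smoothingIdf(3, 10)   -> log10(11/3) ≈ 0.564
# probabilityIdf(3, 10) -> log10(8/3)  ≈ 0.426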
In [28]:
# term-document matrix -> term weight matrix
# using defaultdict means missing keys never need special handling
N = len(DTM)

def TDM2TWM(TDM):
    TWM = defaultdict(lambda: defaultdict(float))
    DVL = defaultdict(float)  # squared document vector lengths, used later for normalization
    for term, tfList in TDM.items():
        df = len(tfList)
        idf = rawIdf(df, N)
        for fileName, tf in tfList.items():
            TWM[term][fileName] = tf * idf
            DVL[fileName] += TWM[term][fileName] ** 2
    return TWM, DVL
In [29]:
TWM,DVL = TDM2TWM(TDM)
In [ ]:
# TWM
In [ ]:
globalTF = list()
globalDocument = list()
for (docName, docContent) in enumerate(newCorpus):
    docIdx = len(globalDocument)
    globalDocument.append(docName)
    localPosting = dict()
    maxCount = 0
    # local pass / split on whitespace
    for term in docContent.lower().split():
        maxCount += 1
        if term not in localPosting.keys():
            localPosting[term] = 1
        else:
            localPosting[term] += 1
    print(docName)
    a = 0.5
    maxFreq = max(localPosting.values())
    for term, freq in localPosting.items():
        # print("1. {0} rawTF : {1}".format(term, rawTF(freq)))
        # print("2. {0} normTF : {1}".format(term, normTF(freq, maxCount)))
        # print("3. {0} logTF : {1}".format(term, logTF(freq)))
        # print("4. {0} maxTF : {1}".format(term, maxTF(a, freq, maxFreq)))
        # print()
        localPosting[term] = maxTF(a, freq, maxFreq)
    # first draft of the posting construction: the membership test below checks
    # localPosting against its own keys, so only the else branch ever runs;
    # the cleaned-up version in the next cell uses a separate globalLexicon
    for indexTerm, termTF in localPosting.items():
        if indexTerm not in localPosting.keys():
            lexiconIdx = len(localPosting)
            postingIdx = len(globalTF)  # fseek
            postingData = (lexiconIdx, docIdx, termTF, -1)
            globalTF.append(postingData)
            localPosting[indexTerm] = postingIdx  # position in globalPosting (ptr:idx)
        else:
            lexiconIdx = list(localPosting.keys()).index(indexTerm)
            postingIdx = len(globalTF)
            beforeIdx = localPosting[indexTerm]
            postingData = (lexiconIdx, docIdx, termTF, beforeIdx)
            globalTF.append(postingData)
            localPosting[indexTerm] = postingIdx
In [32]:
globalLexicon = dict()
globalDocument = list()
globalPosting = list()
for (docName, docContent) in enumerate(newCorpus):
    # used in place of a pointer; assumes document keys/names never collide
    docIdx = len(globalDocument)
    globalDocument.append(docName)
    # {term: freq, term: freq, ...}
    localPosting = dict()
    # local pass / split on whitespace
    for term in docContent.lower().split():
        if term not in localPosting.keys():
            localPosting[term] = 1
        else:
            localPosting[term] += 1
    maxFreq = max(localPosting.values())
    # fp -> struct(term, freq) (localPosting)
    # merging and sorting happen together here
    for indexTerm, termFreq in localPosting.items():
        if indexTerm not in globalLexicon.keys():
            lexiconIdx = len(globalLexicon)
            postingIdx = len(globalPosting)  # fseek
            postingData = [lexiconIdx, docIdx, maxTF(0, termFreq, maxFreq), -1]
            globalPosting.append(postingData)
            globalLexicon[indexTerm] = postingIdx  # position in globalPosting (ptr:idx)
        else:
            lexiconIdx = list(globalLexicon.keys()).index(indexTerm)
            postingIdx = len(globalPosting)
            beforeIdx = globalLexicon[indexTerm]
            postingData = [lexiconIdx, docIdx, maxTF(0, termFreq, maxFreq), beforeIdx]
            globalPosting.append(postingData)
            globalLexicon[indexTerm] = postingIdx  # position in globalPosting (ptr:idx)
    # print(localPosting)
    # print(globalDocument)
# if term not in globalLexicon.keys():
#     localPosting
#     lexiconIdx = len(globalLexicon)  # length will be 0 at first
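Each posting stores a backward pointer (beforeIdx) to the previous posting for the same term, so the postings for a term can be walked from globalLexicon back through globalPosting. A minimal traversal sketch (the term '아파트' is only an illustrative key):

def traversePostings(term):
    # follow the beforeIdx chain from the most recent posting; -1 marks the end
    idx = globalLexicon.get(term, -1)
    while idx != -1:
        lexiconIdx, docIdx, weight, beforeIdx = globalPosting[idx]
        print(globalDocument[docIdx], weight)
        idx = beforeIdx

# traversePostings('아파트')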
In [33]:
query = "서울시에 거래되는 아파트 전세값은?"  # "What is the jeonse (deposit-lease) price of apartments traded in Seoul?"
In [34]:
queryRepr = defaultdict(int)  # morpheme frequencies of the query
for token in query.split():
    for morpheme in ma(token):
        queryRepr[morpheme] += 1

queryWeight = defaultdict(float)
maxFreq = max(queryRepr.values())
for token, freq in queryRepr.items():
    if token in TWM.keys():
        tf = maxTF(0.5, freq, maxFreq)
        df = len(TWM[token])
        idf = rawIdf(df, N)
        queryWeight[token] = tf * idf
In [35]:
queryWeight
Out[35]:
In [48]:
def innerProduct(x, y):
    return x * y
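The ranking below sums these products per document and divides by sqrt(DVL[fileName]), the document vector norm. That is cosine similarity with the query norm dropped, which does not change the ranking because the query norm is the same constant for every document. A small sketch of the same computation on toy weights (the values are made up):

q = {'아파트': 0.8, '전세': 0.5}              # hypothetical query weights
d = {'아파트': 1.2, '전세': 0.3, '서울': 0.9}  # hypothetical document weights

score = sum(innerProduct(w, d[t]) for t, w in q.items() if t in d)
norm = sum(v ** 2 for v in d.values()) ** 0.5
print(score / norm)  # cosine-style score, query norm omitted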
In [ ]:
from math import sqrt

candidateList = defaultdict(float)
for token, weight in queryWeight.items():
    for fileName, tfidf in TWM[token].items():
        # print(" {0} : {1} = {2} * {3}".format(token, fileName, weight, tfidf))
        candidateList[fileName] += innerProduct(weight, tfidf)

# normalize by the document vector length (cosine denominator, query norm omitted)
for fileName, sumProduct in candidateList.items():
    candidateList[fileName] /= sqrt(DVL[fileName])
In [37]:
from nltk.tokenize import sent_tokenize

K = 5
resultList = sorted(candidateList.items(), key=lambda x: x[1], reverse=True)
for i, (fileName, similarity) in enumerate(resultList):
    if i < K:
        print(" Rank:{0} / Document:{1} / Similarity:{2:.4f}".format((i + 1), fileName, similarity))
        with open(fileName, encoding='utf-8') as f:
            content = f.read()
        content = sent_tokenize(content)
        print(content[:5])