728x90
반응형
In [665]:
import os
def getFileList(base='./', ext='.txt'):
    """Return paths ('<base>/<name>') of files in *base* whose name ends with *ext*.

    base: directory to scan (non-recursive).
    ext:  required filename suffix, including the leading dot.
    """
    # str.endswith checks the full suffix including the dot; the original
    # inline comment claimed equivalence with split('.')[-1] == ext, but that
    # comparison would drop the dot and never match '.txt'.
    return ['{0}/{1}'.format(base, name)
            for name in os.listdir(base)
            if name.endswith(ext)]
In [666]:
def getContent(file):
    """Read *file* as UTF-8 text and return its entire contents."""
    with open(file, encoding='utf-8') as handle:
        return handle.read()
In [667]:
# Derive the distinct news categories from the file names under ./News.
# Each path looks like './News/<category>_<rest>.txt', so splitting on '/'
# yields ['.', 'News', '<category>_<rest>.txt'] and the category is the part
# of the basename before the first '_'.
cat = getFileList('./News')
catList = list({path.split('/')[2].split('_')[0] for path in cat})
In [668]:
import os
# Load every ./News document whose filename starts with a known category,
# storing [document_text, category] pairs.
corpus = list()
for fname in os.listdir("./News"):
    for category in catList:
        if not fname.startswith(category):
            continue
        with open("./News/" + fname, encoding='utf-8') as fh:
            corpus.append([fh.read(), category])
In [669]:
# Use the whole corpus as the initial training set (re-bound per fold later).
trainingSet = corpus
In [670]:
# Partition the training documents by their category label (d[1]).
def _docs_with_label(label):
    """All [text, category] pairs in trainingSet whose category equals *label*."""
    return [d for d in trainingSet if d[1] == label]

trainingSetPol = _docs_with_label("정치")       # politics
trainingSetSoc = _docs_with_label("사회")       # society
trainingSetFin = _docs_with_label("경제")       # economy
trainingSetWorld = _docs_with_label("세계")     # world
trainingSetLife = _docs_with_label("생활문화")  # life & culture
trainingSetIT = _docs_with_label("IT과학")      # IT & science
In [671]:
# Total number of documents (denominator for the class priors below).
N = len(trainingSet)
N
Out[671]:
In [672]:
# Per-category document counts. The original cell listed len(trainingSetWorld)
# twice (copy-paste slip), producing 7 entries for 6 categories; the duplicate
# is removed so each category appears exactly once.
len(trainingSetPol), len(trainingSetSoc), len(trainingSetFin), len(trainingSetWorld), len(trainingSetLife), len(trainingSetIT)
Out[672]:
In [673]:
from string import punctuation
In [674]:
# Inspection cell: index range over the politics documents.
range(len(trainingSetPol))
Out[674]:
In [675]:
import re

# Strip ASCII punctuation and a recurring crawler artefact from every
# document, collecting the cleaned (text, category) tuples per category.
PolNews = list()
SocNews = list()
FinNews = list()
WorldNews = list()
LifeNews = list()
ITNews = list()
# Kept for compatibility with the original cell; unused below.
pattern = re.compile(r'[%s]' % re.escape(punctuation))

# Loop invariants hoisted out of the per-document loops (the original rebuilt
# str.maketrans on every iteration). The boilerplate string has no dot in
# 'flashremoveCallback' because punctuation is stripped before the replace.
_PUNCT_TABLE = str.maketrans('', '', punctuation)
_BOILERPLATE = 'flash 오류를 우회하기 위한 함수 추가\nfunction flashremoveCallback'

def _clean_into(docs, cleaned):
    """Strip punctuation/boilerplate from each [text, cat] pair in *docs*
    (mutating the pair in place, as the original code did) and append the
    resulting (text, cat) tuple to *cleaned*."""
    for doc in docs:
        doc[0] = doc[0].translate(_PUNCT_TABLE).replace(_BOILERPLATE, '')
        cleaned.append((doc[0], doc[1]))

# One call per category replaces six copy-pasted loops.
for _docs, _cleaned in ((trainingSetPol, PolNews), (trainingSetSoc, SocNews),
                        (trainingSetFin, FinNews), (trainingSetWorld, WorldNews),
                        (trainingSetLife, LifeNews), (trainingSetIT, ITNews)):
    _clean_into(_docs, _cleaned)
In [676]:
# Concatenate all cleaned documents. (Original note: "125 in total, so 5 folds
# of 25 each" — a later cell re-splits assuming 185 documents.)
trainAll = PolNews + SocNews + FinNews + WorldNews + LifeNews + ITNews
len(trainAll)
Out[676]:
In [677]:
# Pair each document with a stable index: entries become (idx, (text, cat)).
trainAll = list(enumerate(trainAll))
In [760]:
# Shuffle the data in place. The original wrapped the shuffle in
# `for i in range(0, 1)`, a single-iteration loop with no effect — one
# direct call is equivalent.
from random import shuffle
shuffle(trainAll)
print(len(trainAll))
print(trainAll[4])
In [719]:
# 5 folds of 25 (stale cell for the 125-document split; superseded by the
# 37-per-fold cell below). BUG FIX: the original referenced an undefined
# name `train`, which raises NameError — the shuffled list is `trainAll`.
fold1 = trainAll[:25]
fold2 = trainAll[25:50]
fold3 = trainAll[50:75]
fold4 = trainAll[75:100]
fold5 = trainAll[100:125]
In [679]:
# 5 folds over 185 documents, 37 each.
fold1 = trainAll[:37]
fold2 = trainAll[37:74]
fold3 = trainAll[74:111]
fold4 = trainAll[111:148]
# BUG FIX: the original used trainAll[161:185], silently dropping items
# 148-160 from every split; the last fold must start where fold4 ends.
fold5 = trainAll[148:185]
In [720]:
# Fold 1: hold out fold1 for validation, train on folds 2-5.
X_val = fold1
X_train = fold2+fold3+fold4+fold5
In [681]:
# Fold 2: hold out fold2 for validation, train on the other four folds.
X_val = fold2
X_train = fold1+fold3+fold4+fold5
In [703]:
# Fold 3: hold out fold3 for validation, train on the other four folds.
X_val = fold3
X_train = fold1+fold2+fold4+fold5
In [683]:
# Fold 4: hold out fold4 for validation, train on the other four folds.
X_val = fold4
X_train = fold1+fold2+fold3+fold5
In [761]:
# Fold 5: hold out fold5 for validation, train on the other four folds.
X_val = fold5
X_train = fold1+fold2+fold3+fold4
In [762]:
# Rebind trainingSet to the current training split; entries are (idx, (text, cat)).
trainingSet = X_train
In [763]:
# Re-partition the current training split by label. After enumerate(), each
# entry is (idx, (text, category)), so the label is at d[1][1].
def _fold_docs(label):
    """Entries of trainingSet whose category equals *label*."""
    return [d for d in trainingSet if d[1][1] == label]

trainingSetPol = _fold_docs("정치")       # politics
trainingSetSoc = _fold_docs("사회")       # society
trainingSetFin = _fold_docs("경제")       # economy
trainingSetWorld = _fold_docs("세계")     # world
trainingSetLife = _fold_docs("생활문화")  # life & culture
trainingSetIT = _fold_docs("IT과학")      # IT & science
In [764]:
# Class priors prior[c], estimated by MLE: (#docs in class) / (#docs total).
# NOTE(review): these counts come from the full-corpus PolNews/SocNews/... lists,
# not from the per-fold trainingSet* lists built just above — validation
# documents leak into the priors. Confirm whether this is intended.
priorPol = len(PolNews) / N
priorSoc = len(SocNews) / N
priorFin = len(FinNews) / N
priorWorld = len(WorldNews) / N
priorLife = len(LifeNews) / N
priorIT = len(ITNews) / N
In [765]:
# Accumulate per-class whitespace-token counts.
from collections import defaultdict

tokensPol = defaultdict(int)
tokensSoc = defaultdict(int)
tokensFin = defaultdict(int)
tokensWorld = defaultdict(int)
tokensLife = defaultdict(int)
tokensIT = defaultdict(int)

def _count_tokens(docs, counter):
    """Add the whitespace-token counts of every (text, cat) pair in *docs* to *counter*."""
    for text, _category in docs:
        for token in text.split():
            counter[token] += 1

# One call per class replaces six copy-pasted loops.
# NOTE(review): counts come from the full-corpus *News lists, not the current
# training fold — the same leakage as the priors above; confirm if intended.
for _docs, _counter in ((PolNews, tokensPol), (SocNews, tokensSoc),
                        (FinNews, tokensFin), (WorldNews, tokensWorld),
                        (LifeNews, tokensLife), (ITNews, tokensIT)):
    _count_tokens(_docs, _counter)

# Vocabulary: union of every token seen in any class.
vocabulary = set()
for _counter in (tokensPol, tokensSoc, tokensFin, tokensWorld, tokensLife, tokensIT):
    vocabulary.update(_counter.keys())
In [766]:
# Per-class total token counts (denominators of the conditional probabilities).
sumPol = sum(tokensPol.values())
sumSoc = sum(tokensSoc.values())
sumFin = sum(tokensFin.values())
sumWorld = sum(tokensWorld.values())
sumLife = sum(tokensLife.values())
sumIT = sum(tokensIT.values())
In [767]:
from math import log

# Log conditional probabilities with add-one (Laplace) smoothing:
#   P(t|c) = (count(t,c) + 1) / (total_tokens(c) + |V|)
# Adding 1 per vocabulary entry in the numerator adds B = |V| in the denominator.
# NOTE(review): defaultdict(float) returns 0.0 (log-prob of 1.0) for tokens
# outside the vocabulary; since that constant is added to every class alike it
# does not change the argmax, but the values are not true log-probabilities.
B = len(vocabulary)

def _laplace_log_probs(counts, total):
    """Smoothed log P(t|c) over the shared vocabulary for one class."""
    cp = defaultdict(float)
    for t in vocabulary:
        cp[t] = log((counts[t] + 1) / (total + B))
    return cp

# One call per class replaces six copy-pasted assignment lines.
cpPol = _laplace_log_probs(tokensPol, sumPol)
cpSoc = _laplace_log_probs(tokensSoc, sumSoc)
cpFin = _laplace_log_probs(tokensFin, sumFin)
cpWorld = _laplace_log_probs(tokensWorld, sumWorld)
cpLife = _laplace_log_probs(tokensLife, sumLife)
cpIT = _laplace_log_probs(tokensIT, sumIT)
In [768]:
# Inspection cell: vocabulary size |V|.
B
Out[768]:
In [769]:
from math import log, exp
from sklearn.metrics import confusion_matrix, classification_report

# Score every held-out document under each class and predict the argmax.
prob = 0           # count of correct predictions
xlist = list()     # misclassified [true, predicted] pairs
trueClass = list()
predClass = list()
# (label, prior, log-cond-prob table) per class; insertion order fixes
# tie-breaking in max() exactly as the original literal dict did.
_classes = (("정치", priorPol, cpPol), ("사회", priorSoc, cpSoc),
            ("경제", priorFin, cpFin), ("세계", priorWorld, cpWorld),
            ("생활문화", priorLife, cpLife), ("IT과학", priorIT, cpIT))
for d in X_val:
    tokens = d[1][0].split()
    AddScoreList = dict()
    for label, prior, cp in _classes:
        # log turns the product of probabilities into a sum
        score = log(prior)
        for t in tokens:
            score += cp[t]
        AddScoreList[label] = score
    key_max = max(AddScoreList.keys(), key=(lambda k: AddScoreList[k]))
    trueClass.append(d[1][1])
    predClass.append(key_max)
    if d[1][1] == key_max:
        prob += 1
    else:
        xlist.append([d[1][1], key_max])
acc = prob / len(X_val)
print("맞춘 비율 : {0}".format(acc))
conf = confusion_matrix(trueClass, predClass)
print(conf)
print(classification_report(trueClass, predClass))
# Predictions follow the training-set token frequencies.
In [770]:
# Per-fold validation accuracies, recorded manually from the runs above.
fold1_acc = 0.96
fold2_acc = 1
fold3_acc = 0.92
fold4_acc = 1
fold5_acc = 0.92
In [771]:
import os

# Load the held-out test documents, labelled by filename prefix.
corpus2 = list()
for fname in os.listdir("./Testing_Data"):
    for category in catList:
        if not fname.startswith(category):
            continue
        with open("./Testing_Data/" + fname, encoding='utf-8') as fh:
            corpus2.append([fh.read(), category])
In [772]:
# Inspection cell: first test document as [text, category].
corpus2[0]
Out[772]:
In [773]:
# Recompute the add-one smoothed log conditional probabilities.
# NOTE(review): this cell repeats the same computation as the earlier training
# cell verbatim, rebuilt here before scoring the test set.
from math import log
cpPol = defaultdict(float)
cpSoc = defaultdict(float)
cpFin = defaultdict(float)
cpWorld = defaultdict(float)
cpLife = defaultdict(float)
cpIT = defaultdict(float)
B = len(vocabulary)
for t in vocabulary:
    cpPol[t] = log((tokensPol[t]+1) / (sumPol + B)) # add-one smoothing: +1 per token, +B = |V| in the denominator
    cpSoc[t] = log((tokensSoc[t]+1) / (sumSoc + B))
    cpFin[t] = log((tokensFin[t]+1) / (sumFin + B))
    cpWorld[t] = log((tokensWorld[t]+1) / (sumWorld + B))
    cpLife[t] = log((tokensLife[t]+1) / (sumLife + B))
    cpIT[t] = log((tokensIT[t]+1) / (sumIT + B))
In [774]:
from math import log, exp
from sklearn.metrics import confusion_matrix, classification_report

# Score every TEST document (corpus2 entries are [text, category]) and
# predict the class with the highest posterior log-score.
prob = 0           # count of correct predictions
xlist = list()     # misclassified [true, predicted] pairs
trueClass = list()
predClass = list()
for d in corpus2:
    scorePol = log(priorPol)
    scoreSoc = log(priorSoc)
    scoreFin = log(priorFin)
    scoreWorld = log(priorWorld)
    scoreLife = log(priorLife)
    scoreIT = log(priorIT)
    for t in d[0].split():
        # log turns the product of probabilities into a sum
        scorePol += cpPol[t]
        scoreSoc += cpSoc[t]
        scoreFin += cpFin[t]
        scoreWorld += cpWorld[t]
        scoreLife += cpLife[t]
        scoreIT += cpIT[t]
    AddScoreList = dict({"정치":scorePol,"사회":scoreSoc, "경제": scoreFin, "세계":scoreWorld, "생활문화":scoreLife, "IT과학":scoreIT})
    key_max = max(AddScoreList.keys(), key=(lambda k: AddScoreList[k]))
    trueClass.append(d[1])
    predClass.append(key_max)
    if d[1] == key_max:
        prob += 1
    else:
        xlist.append([d[1], key_max])
# BUG FIX: accuracy must be divided by the number of test documents, not by
# the size of the unrelated validation fold X_val as the original did.
acc = prob / len(corpus2)
# print("맞춘 비율 : {0}".format(acc))
conf = confusion_matrix(trueClass, predClass)
print(conf)
print(classification_report(trueClass, predClass))
# Predictions follow the training-set token frequencies.
In [775]:
from math import log

# Log conditional probabilities with add-K (Lidstone) smoothing, K = 0.5:
#   P(t|c) = (count(t,c) + K) / (total_tokens(c) + K * |V|)
cpPol = defaultdict(float)
cpSoc = defaultdict(float)
cpFin = defaultdict(float)
cpWorld = defaultdict(float)
cpLife = defaultdict(float)
cpIT = defaultdict(float)
K = 0.5
B = len(vocabulary)
for t in vocabulary:
    # BUG FIX: the original denominator was (total + K*2), which only
    # normalises a two-word vocabulary; adding K to every numerator requires
    # adding K once per vocabulary entry, i.e. K*B, in the denominator.
    cpPol[t] = log((tokensPol[t]+K) / (sumPol + K*B))
    cpSoc[t] = log((tokensSoc[t]+K) / (sumSoc + K*B))
    cpFin[t] = log((tokensFin[t]+K) / (sumFin + K*B))
    cpWorld[t] = log((tokensWorld[t]+K) / (sumWorld + K*B))
    cpLife[t] = log((tokensLife[t]+K) / (sumLife + K*B))
    cpIT[t] = log((tokensIT[t]+K) / (sumIT + K*B))
In [776]:
from math import log, exp
from sklearn.metrics import confusion_matrix, classification_report

# Score every TEST document again, this time with the add-K smoothed tables.
prob = 0           # count of correct predictions
xlist = list()     # misclassified [true, predicted] pairs
trueClass = list()
predClass = list()
for d in corpus2:
    scorePol = log(priorPol)
    scoreSoc = log(priorSoc)
    scoreFin = log(priorFin)
    scoreWorld = log(priorWorld)
    scoreLife = log(priorLife)
    scoreIT = log(priorIT)
    for t in d[0].split():
        # log turns the product of probabilities into a sum
        scorePol += cpPol[t]
        scoreSoc += cpSoc[t]
        scoreFin += cpFin[t]
        scoreWorld += cpWorld[t]
        scoreLife += cpLife[t]
        scoreIT += cpIT[t]
    LapScoreList = dict({"정치":scorePol,"사회":scoreSoc, "경제": scoreFin, "세계":scoreWorld, "생활문화":scoreLife, "IT과학":scoreIT})
    key_max = max(LapScoreList.keys(), key=(lambda k: LapScoreList[k]))
    trueClass.append(d[1])
    predClass.append(key_max)
    if d[1] == key_max:
        prob += 1
    else:
        xlist.append([d[1], key_max])
# BUG FIX: accuracy must be divided by the number of test documents, not by
# the size of the unrelated validation fold X_val as the original did.
acc = prob / len(corpus2)
# print("맞춘 비율 : {0}".format(acc))
conf = confusion_matrix(trueClass, predClass)
print(conf)
print(classification_report(trueClass, predClass))
# Predictions follow the training-set token frequencies.
728x90
반응형