728x90
반응형
In [73]:
# K-Means Clustering 만들기
# K, centroid, data= random
import random
# Sample points to cluster; populated by a later cell.
data = []
# Centroid of each cluster; populated by a later cell.
cluster = []
# Number of clusters.
K = 3
# Number of sample points.
N = 100
In [74]:
# Draw N sample points with integer coordinates in [0, N); randrange
# excludes the upper bound, so N itself is never produced.
for _ in range(N):
    data.append((random.randrange(0, N), random.randrange(0, N)))
# Start from K centroids drawn at random from the same coordinate range.
for _ in range(K):
    cluster.append((random.randrange(0, N), random.randrange(0, N)))
In [147]:
# data
In [76]:
cluster
Out[76]:
In [77]:
import matplotlib.pyplot as plt
plt.figure(figsize=(5, 5))
# One color per cluster; the last entry ('k') is used for the data points,
# which have not been assigned to a cluster yet.
colorMap = ('r', 'g', 'b', 'k')
# Data points: a single scatter call for the first N points instead of one
# call per point.
points = data[:N]
plt.scatter([p[0] for p in points], [p[1] for p in points],
            alpha=0.3, color=colorMap[-1])
# Centroids, each drawn in its own cluster color.
for i in range(K):
    plt.scatter(cluster[i][0], cluster[i][1], color=colorMap[i])
plt.show()
In [78]:
# 이제 EM만 진행하면 됨.
from math import sqrt
# 두 점 사이의 거리재기
def euclidean(x, y):
    """Return the Euclidean distance between the points ``x`` and ``y``.

    Works for points of any dimension, not only 2-D.

    Args:
        x: Sequence of numeric coordinates.
        y: Sequence of numeric coordinates with the same length as ``x``.

    Returns:
        The straight-line distance between the two points, as a float.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same length.
    """
    if len(x) != len(y):
        raise ValueError("points must have the same number of dimensions")
    return sqrt(sum((a - b) ** 2 for a, b in zip(x, y)))
In [79]:
def expectation(data, clusters):
    """E-step for a single point: return the index of its nearest centroid.

    Args:
        data: One point (a sequence of coordinates).
        clusters: Sequence of centroid points.

    Returns:
        Index into ``clusters`` of the centroid with the smallest Euclidean
        distance to ``data``. Ties go to the lowest index.
    """
    # Walk the centroids that were actually passed in rather than counting up
    # to the global K, so the result cannot go wrong (or raise IndexError)
    # when K and ``clusters`` are out of sync.
    distance = [euclidean(data, centroid) for centroid in clusters]
    return distance.index(min(distance))
In [80]:
# Build rnk, the one-hot assignment matrix for the current centroids:
# N rows of K zeros, where rnk[n][k] becomes 1 when point n is assigned to
# cluster k.
rnk = [[0] * K for _ in range(N)]
for i in range(N):
    # Mark the column of the nearest centroid.
    k = expectation(data[i], cluster)
    rnk[i][k] = 1
# Print how many points were assigned to each cluster.
for i in range(K):
    dataset = [data[j] for j in range(N) if rnk[j][i]]
    print(len(dataset))
In [81]:
def maximization(data):
    """M-step for a single cluster: return the mean of its points.

    Args:
        data: Non-empty sequence of points, all with the same number of
            coordinates.

    Returns:
        Tuple with the per-coordinate mean of ``data`` (the new centroid).

    Raises:
        ValueError: If ``data`` is empty. A cluster without members has no
            mean, so the caller has to decide what to do with it.
    """
    count = len(data)
    if count == 0:
        raise ValueError("cannot compute the centroid of an empty cluster")
    # zip(*data) regroups the points by coordinate, so the mean is taken per
    # axis for any number of dimensions rather than just (x, y).
    return tuple(sum(axis) / count for axis in zip(*data))
In [82]:
# M-step: move every centroid to the mean of the points assigned to it.
for i in range(K):
    dataset = [data[j] for j in range(N) if rnk[j][i]]
    # A centroid that attracted no points keeps its current position;
    # maximization() cannot average an empty cluster.
    if dataset:
        cluster[i] = maximization(dataset)
In [83]:
cluster
# 이거로 다시 EM돌리는 것 반복
Out[83]:
In [84]:
def sse(data, centroid):
    """Return the sum of squared errors of ``data`` around ``centroid``.

    Args:
        data: Iterable of points belonging to one cluster.
        centroid: The centroid of that cluster.

    Returns:
        Sum over all points of the squared Euclidean distance to
        ``centroid``; 0.0 when ``data`` is empty.
    """
    total = 0.0
    for point in data:
        # Square each distance: the quantity is a sum of *squared* errors,
        # not a sum of plain distances.
        total += euclidean(point, centroid) ** 2
    return total
In [103]:
######## 다시
from random import sample
cluster = sample(data,K) # 3 포인트만 반환함
In [104]:
# Run a fixed number of EM iterations, printing the SSE and plotting the
# clustering after each one.
iterCount = 10
sseList = []
for step in range(iterCount):
    # E-step: rebuild the one-hot assignment matrix for the current centroids.
    rnk = [[0] * K for _ in range(N)]
    for i in range(N):
        k = expectation(data[i], cluster)
        rnk[i][k] = 1
    # M-step: accumulate the SSE against the current centroids, then move
    # each centroid to the mean of its points.
    _sum = 0.0
    oldCluster = []
    for i in range(K):
        dataset = [data[j] for j in range(N) if rnk[j][i]]
        _sum += sse(dataset, cluster[i])
        oldCluster.append(cluster[i])
        # A centroid that attracted no points keeps its position;
        # maximization() cannot average an empty cluster.
        if dataset:
            cluster[i] = maximization(dataset)
    print("Iteration:{0} / SSE:{1}".format(step + 1, _sum))
    # Data points, colored by the cluster whose slot in rnk[i] holds the 1
    # (numpy's argmax would do the same).
    for i in range(N):
        plt.scatter(data[i][0], data[i][1], alpha=0.3,
                    color=colorMap[rnk[i].index(max(rnk[i]))])
    # Centroids: a red segment from the previous position to the new one,
    # then the new position itself.
    for i in range(K):
        plt.plot((oldCluster[i][0], cluster[i][0]),
                 (oldCluster[i][1], cluster[i][1]), 'r-')
        plt.scatter(cluster[i][0], cluster[i][1],
                    color=colorMap[i], edgecolors=colorMap[-1])
    plt.show()
    sseList.append(_sum)
In [105]:
sseList
# 떨어지다가 어느 지점에서 수렴한다.
Out[105]:
In [106]:
# Visualization: plot the recorded SSE values against the 1-based iteration
# number as a red line.
plt.plot(range(1,iterCount+1),sseList,'r-')
plt.show()
In [107]:
import matplotlib.pyplot as plt
plt.figure(figsize=(5, 5))
# One color per cluster; the last entry ('k') outlines the centroids.
colorMap = ('r', 'g', 'b', 'k')
# Data points, colored by the cluster whose slot in rnk[i] holds the 1
# (numpy's argmax would do the same).
for i in range(N):
    plt.scatter(data[i][0], data[i][1], alpha=0.3,
                color=colorMap[rnk[i].index(max(rnk[i]))])
# Centroids in their cluster color with a black edge.
for i in range(K):
    plt.scatter(cluster[i][0], cluster[i][1],
                color=colorMap[i], edgecolors=colorMap[-1])
plt.show()
In [144]:
# Run the EM iterations for every K from 2 to 9, starting each K from K
# centroids sampled out of the data.
# NOTE(review): this paste has lost its indentation, so the loop nesting below
# cannot be recovered with certainty (in particular whether the plotting runs
# once per iteration or once per K) — restore it from the original notebook.
# NOTE(review): maximization(dataset) is called even when no point was assigned
# to cluster i; confirm that an empty dataset is handled.
from random import sample
iterCount = 10
sseList = list()
colorMap = ('r','g','b','c','m','y','C0','C1','C3','C4','C5','k')
for K in range(2,10):
cluster = sample(data,K)
print("K:{0}".format(K))
for _ in range(iterCount):
rnk = list(list(0 for _ in range(K)) for _ in range(N))
for i in range(N):
j = expectation(data[i],cluster)
rnk[i][j] = 1
# Set the slot of the chosen cluster to 1.
_sum = 0.0
oldCluster = list()
for i in range(K):
dataset = [data[j] for j in range(N) if rnk[j][i]]
_sum += sse(dataset,cluster[i])
oldCluster.append(cluster[i])
cluster[i] = maximization(dataset)
sseList.append(_sum)
print("Iteration:{0} / SSE:{1}".format(_+1,_sum))
for i in range(N):
plt.scatter(data[i][0],data[i][1],alpha=0.3,color=colorMap[rnk[i].index(max(rnk[i]))]) # Index of the slot holding the 1 (numpy argmax would also work).
# Centroids: red segment from the previous position to the new one, then the new position.
for i in range(K):
plt.plot((oldCluster[i][0],cluster[i][0]),
(oldCluster[i][1],cluster[i][1]),'r-')
plt.scatter(cluster[i][0],cluster[i][1],color=colorMap[i], edgecolors=colorMap[-1])
plt.show()
In [132]:
def cosine(x, y):
    """Return the cosine similarity of the 2-D vectors ``x`` and ``y``.

    Args:
        x: Sequence whose first two items are the vector's coordinates.
        y: Sequence whose first two items are the vector's coordinates.

    Returns:
        The dot product divided by the product of the two vector lengths.
        When either vector has zero length the angle is undefined and 0.0
        is returned instead of dividing by zero.
    """
    dot = x[0] * y[0] + x[1] * y[1]
    # Product of the vector lengths (each point's distance from the origin).
    lengths = sqrt(x[0] ** 2 + x[1] ** 2) * sqrt(y[0] ** 2 + y[1] ** 2)
    if lengths == 0:
        return 0.0
    return dot / lengths
In [138]:
def expectation(data, clusters, cos=True):
    """E-step for a single point: return the index of its best centroid.

    Args:
        data: One point (a sequence of coordinates).
        clusters: Sequence of centroid points.
        cos: When true, score with cosine() and pick the largest value;
            otherwise score with euclidean() and pick the smallest.

    Returns:
        Index into ``clusters`` of the selected centroid. Ties go to the
        lowest index.
    """
    if cos:
        # Cosine is a similarity, so the best centroid has the maximum score.
        metric, pick = cosine, max
    else:
        # Euclidean is a distance, so the best centroid has the minimum score.
        metric, pick = euclidean, min
    # Score the centroids that were actually passed in rather than counting
    # up to the global K, so K and ``clusters`` cannot get out of sync.
    scores = [metric(data, centroid) for centroid in clusters]
    return scores.index(pick(scores))
In [146]:
# Run the EM iterations for every K from 2 to 9 again, this time assigning
# points with expectation(..., cos=True).
# NOTE(review): this paste has lost its indentation, so the loop nesting below
# cannot be recovered with certainty (in particular whether the plotting runs
# once per iteration or once per K) — restore it from the original notebook.
# NOTE(review): maximization(dataset) is called even when no point was assigned
# to cluster i; confirm that an empty dataset is handled.
from random import sample
iterCount = 10
sseList = list()
colorMap = ('r','g','b','c','m','y','C0','C1','C3','C4','C5','k')
for K in range(2,10):
cluster = sample(data,K)
print("K:{0}".format(K))
for _ in range(iterCount):
rnk = list(list(0 for _ in range(K)) for _ in range(N))
for i in range(N):
j = expectation(data[i],cluster,cos=True)
rnk[i][j] = 1
# Set the slot of the chosen cluster to 1.
_sum = 0.0
oldCluster = list()
for i in range(K):
dataset = [data[j] for j in range(N) if rnk[j][i]]
_sum += sse(dataset,cluster[i])
oldCluster.append(cluster[i])
cluster[i] = maximization(dataset)
sseList.append(_sum)
print("Iteration:{0} / SSE:{1}".format(_+1,_sum))
for i in range(N):
plt.scatter(data[i][0],data[i][1],alpha=0.3,color=colorMap[rnk[i].index(max(rnk[i]))]) # Index of the slot holding the 1 (numpy argmax would also work).
# Centroids: red segment from the previous position to the new one, then the new position.
for i in range(K):
plt.plot((oldCluster[i][0],cluster[i][0]),
(oldCluster[i][1],cluster[i][1]),'r-')
plt.scatter(cluster[i][0],cluster[i][1],color=colorMap[i], edgecolors=colorMap[-1])
plt.show()
In [150]:
# Toy corpus of eight short English sentences (some about cats, some about
# Google products) used as input for the term-weighting cells below.
documents = [
"This little kitty came to play when I was eating at a restaurant.",
"Merley has the best squooshy kitten belly.",
"Google Translate app is incredivle.",
"If you open 100 tab in google you get a smiley face.",
"Best cat photo I've ever taken.",
"Climbing ninja cat.",
"Impressed with google map feedback.",
"Key promoter extension for Google Chrome."
]
In [153]:
# Redefined so that it works on points of any dimension.
def euclidean(x, y):
    """Return the Euclidean distance between the points ``x`` and ``y``.

    Args:
        x: Sequence of numeric coordinates.
        y: Sequence of numeric coordinates with the same length as ``x``.

    Returns:
        The straight-line distance between the two points, as a float.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same length, instead
            of silently ignoring the extra coordinates of the longer one.
    """
    if len(x) != len(y):
        raise ValueError("points must have the same number of dimensions")
    return sqrt(sum((a - b) ** 2 for a, b in zip(x, y)))
In [157]:
def cosine(x, y):
    """Return the cosine similarity of the vectors ``x`` and ``y``.

    Args:
        x: Sequence of numeric components.
        y: Sequence of numeric components with the same length as ``x``.

    Returns:
        The dot product divided by the product of the two vector lengths.
        When either vector has zero length the angle is undefined and 0.0
        is returned instead of dividing by zero.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same length.
    """
    if len(x) != len(y):
        raise ValueError("vectors must have the same number of dimensions")
    dot = sum(a * b for a, b in zip(x, y))
    # Product of the two vector lengths, computed directly rather than by
    # building all-zero vectors to measure a distance from the origin.
    lengths = sqrt(sum(a ** 2 for a in x)) * sqrt(sum(b ** 2 for b in y))
    if lengths == 0:
        return 0.0
    return dot / lengths
In [161]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
import re
from collections import defaultdict
In [187]:
# Containers for the document-term data; missing keys are created on access.
# DTM — key: document / value: {term: frequency}
DTM = defaultdict(lambda: defaultdict(int))
# TDM — key: term / value: {document: frequency}
TDM = defaultdict(lambda: defaultdict(int))
# TWM — key: term / value: {document: weight}
TWM = defaultdict(lambda: defaultdict(float))
# English stop words from the NLTK corpus.
stopList = stopwords.words('english')
In [190]:
# Count term frequencies per document: i is the document index, d its text.
# The pattern and the stop-word set are built once, outside the loops.
# A token is dropped when it is a stop word or when it starts with a
# punctuation character (match() only tests the start of the token).
punctPattern = re.compile(r"[{0}]".format(re.escape(punctuation)))
stopSet = set(stopList)
for i, d in enumerate(documents):
    for t in word_tokenize(d.lower()):  # tokens of the lower-cased text
        if t not in stopSet and not punctPattern.match(t):
            DTM[i][t] += 1
# Invert DTM (document -> term) into TDM (term -> document).
for d, termList in DTM.items():
    for t, f in termList.items():
        TDM[t][d] = f
# N is rebound here to the number of documents with at least one counted
# term; V is the number of distinct terms.
N = len(DTM)
V = len(TDM)
# TF-IDF weights.
# TF  : f / maxF, the frequency relative to the document's most frequent term.
# IDF : log(N / df), where df is the number of documents containing the term.
from math import log
# Largest term frequency of each document, computed once per document.
maxTfByDoc = {d: max(termList.values()) for d, termList in DTM.items()}
for t, docList in TDM.items():
    df = len(docList)
    idf = log(N / df)
    for d, f in docList.items():
        TWM[t][d] = (f / maxTfByDoc[d]) * idf
In [195]:
# 가중치 리스트
TWM
Out[195]:
In [196]:
K = 2
# 여기서부터는 내일 ~
728x90
반응형