In [109]:
import numpy as np
words = ["I","like","enjoy","deep","learning","NLP","flying","."]
X = np.array([[0,2,1,0,0,0,0,0],
[2,0,0,1,0,1,0,0],
[1,0,0,0,0,0,1,0],
[0,1,0,0,1,0,0,0],
[0,0,0,1,0,0,0,1],
[0,1,0,0,0,0,0,1],
[0,0,1,0,0,0,0,1],
[0,0,0,0,1,1,1,0]])
U,Sigma,V = np.linalg.svd(X,full_matrices=False)
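A quick sanity check, not part of the original run: with full_matrices=False the three factors should still reconstruct X up to floating-point error.
np.allclose(U.dot(np.diag(Sigma)).dot(V), X)  # expected: True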
In [110]:
Sigma
Out[110]:
In [111]:
# build a 2x2 diagonal matrix from the top two singular values for the 2-D projection
_Sigma = np.diag(Sigma[:2])
_Sigma
Out[111]:
In [112]:
# distributed representation of each word
US = U[:,:2].dot(_Sigma)
SV = _Sigma.dot(V[:2])
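For reference, keeping only the top two singular values yields the best rank-2 approximation of X in the Frobenius norm (Eckart-Young); a minimal check, not in the original cells:
X2 = US.dot(V[:2])       # equals U[:,:2] @ diag(Sigma[:2]) @ V[:2]
np.linalg.norm(X - X2)   # the smallest error any rank-2 matrix can achieve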
In [113]:
US
Out[113]:
In [114]:
for pair,w in zip(US , words):
    print(w,pair)
In [115]:
query = US[words.index("NLP")]
query
Out[115]:
In [116]:
US.shape, query.shape  # US is (8, 2); query is (2,)
Out[116]:
In [117]:
# subtract the query from every row of US, then take row-wise distances
result = np.linalg.norm(US - np.repeat(query,len(words)).reshape(2,-1).T,axis=1)
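The same row-wise distances can be computed with plain NumPy broadcasting, avoiding the repeat/reshape gymnastics:
result = np.linalg.norm(US - query, axis=1)  # (8,2) - (2,) broadcasts across rows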
In [118]:
# these are distances, so values closer to 0 mean more similar
for dist,w in zip(result , words):
    print(w,dist)
In [119]:
# cosine similarity: a·b / (|a| * |b|)
qNorm = np.linalg.norm(query)
mNorm = np.linalg.norm(US, axis=1)
In [120]:
# a scalar and a length-8 vector of norms
qNorm, mNorm
Out[120]:
In [121]:
result = US.dot(query) / (qNorm*mNorm)
In [122]:
# cosine similarity, so values closer to 1 mean more similar
for dist,w in zip(result , words):
    print(w,dist)
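If SciPy is available, each value can be cross-checked pairwise (scipy.spatial.distance.cosine returns the cosine distance, i.e. 1 - similarity):
from scipy.spatial.distance import cosine
1 - cosine(US[words.index("like")], query)  # should match the value printed for "like"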
In [286]:
documents = [
"king is a strong man",
"queen is a wise woman",
"boy is a young man",
"girl is a young man",
"prince is a young king",
"princess is a young queen",
"man is strong",
"woman is pretty",
"prince is a boy will be king",
"princess is a girl will be queen"
]
stopwords = ["is","a","will","be"]
In [287]:
vocabulrary = list()
_documents = list()
for document in documents:
    termList = list()
    for t in document.lower().split():
        if t not in stopwords:
            termList.append(t)
            vocabulrary.append(t)
    _documents.append(termList)
vocabulrary = list(set(vocabulrary))
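One caveat: list(set(...)) produces an arbitrary ordering, so word indices (and the plots later on) can change from run to run; sorting is an optional tweak for reproducibility.
vocabulrary = sorted(set(vocabulrary))  # optional: deterministic word order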
In [288]:
V = len(vocabulrary)
X = np.zeros((V,V))
In [289]:
X.shape
Out[289]:
In [290]:
K = 1
for document in _documents:
    for v in range(len(document)-K):
        # count adjacent co-occurrences symmetrically
        i = vocabulrary.index(document[v])
        j = vocabulrary.index(document[v+1])
        X[i][j] += 1
        X[j][i] += 1
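The loop above only ever pairs a term with its immediate neighbor, so K effectively acts as a window of 1. A hypothetical generalization to an arbitrary window K, shown standalone (running it on top of the cell above would double-count):
for document in _documents:
    for v in range(len(document)):
        for off in range(1, K + 1):            # look ahead up to K positions
            if v + off < len(document):
                i = vocabulrary.index(document[v])
                j = vocabulrary.index(document[v + off])
                X[i][j] += 1
                X[j][i] += 1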
In [291]:
import pandas as pd
pd.DataFrame(X,index=vocabulrary,columns=vocabulrary)
Out[291]:
In [292]:
U,Sigma,V = np.linalg.svd(X,full_matrices=False)
In [293]:
# distributed representation of each word
_Sigma = np.diag(Sigma[:2])  # rebuild from the SVD of the new co-occurrence matrix
US = U[:,:2].dot(_Sigma)
SV = _Sigma.dot(V[:2])
In [294]:
for pair,w in zip(US , vocabulrary):
    print(w,pair)
In [295]:
queen = US[vocabulrary.index("queen")]
king = US[vocabulrary.index("king")]
girl = US[vocabulrary.index("girl")]
query = queen - king + girl
In [296]:
strong = US[vocabulrary.index("strong")]
king = US[vocabulrary.index("king")]
man = US[vocabulrary.index("man")]
query = king - strong - man
In [297]:
# subtract the query from every row of US, then take row-wise distances
result = np.linalg.norm(US - np.repeat(query,len(vocabulrary)).reshape(2,-1).T,axis=1)
In [298]:
# these are distances, so values closer to 0 mean more similar
for dist,w in zip(result , vocabulrary):
    print(w,dist)
In [299]:
import matplotlib.pyplot as plt
plt.figure(figsize=(9,9))
for pair,w in zip(US , vocabulrary):
    plt.text(pair[0],pair[1],w)
plt.xlim(-2.0,0.5)
plt.ylim(-2,1.7)
plt.show()
In [300]:
import tensorflow as tf
In [358]:
# build one-hot vectors
X = np.ones(len(vocabulrary))
X = np.diag(X)
X
Out[358]:
In [359]:
# the trailing [0][0] unwraps the nested array that np.argwhere returns
vocabulrary[np.argwhere(X[0])[0][0]]
Out[359]:
In [360]:
word2ind = lambda x:vocabulrary.index(x)
idx2word = lambda x:vocabulrary[x]
In [361]:
X[word2ind("queen")]
Out[361]:
In [362]:
# WINDOW = 1, illustrated on "king strong man":
# (strong) king
# (king,man) strong
# (strong) man
In [363]:
def generatePair(D, size):
    # collect (context, target) pairs within the given window size
    pairList = list()
    limit = len(D)
    for i, term in enumerate(D):
        start = i - size
        end = i + size + 1
        for j in range(start,end):
            if -1 < j < limit and i != j:
                pairList.append((D[j],term))
    return pairList
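A quick usage example on a stripped three-term document with a window of 1; each tuple is (context, target), matching the comment block above:
generatePair(["king", "strong", "man"], 1)
# [('strong', 'king'), ('king', 'strong'), ('man', 'strong'), ('strong', 'man')]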
In [364]:
WINDOW = 1
inputList = list()
outputList = list()
for document in _documents:
    for pair in generatePair(document,WINDOW):
        inputList.append(pair[0])   # raw terms here; swapped for one-hot vectors below
        outputList.append(pair[1])
In [365]:
for t1,t2 in zip(inputList,outputList):
    print(t1,t2)
In [366]:
len(inputList), len(outputList)
Out[366]:
In [367]:
_documents[0]
Out[367]:
In [375]:
WINDOW = 2
inputList = list()
outputList = list()
for document in _documents:
    for pair in generatePair(document,WINDOW):
        inputList.append(X[word2ind(pair[0])])   # changed: one-hot context vector
        outputList.append(X[word2ind(pair[1])])  # changed: one-hot target vector
In [391]:
for t1,t2 in zip(inputList,outputList):
    print(t1,t2)
In [392]:
inputVector = np.array(inputList)
outputVector = np.array(outputList)
In [393]:
inputVector.shape, outputVector.shape
Out[393]:
In [436]:
V = len(vocabulrary)
DIM = 2
lr = 0.1
inputLayer = tf.placeholder(tf.float32, shape = (None,V))
outputLayer = tf.placeholder(tf.float32, shape = (None,V))
weight1 = tf.Variable(tf.random_normal([V,DIM]))
bias1 = tf.Variable(tf.random_normal([DIM]))
weight2 = tf.Variable(tf.random_normal([DIM,V]))
bias2 = tf.Variable(tf.random_normal([V]))
layer1 = tf.add(tf.matmul(inputLayer,weight1) , bias1)   # projection layer, no activation
layer2 = tf.nn.softmax(tf.add(tf.matmul(layer1,weight2),bias2))
loss = tf.reduce_mean(-tf.reduce_sum(outputLayer * tf.log(layer2),axis=[1]))  # cross-entropy
optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr).minimize(loss)
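Taking tf.log of a softmax can underflow once a probability rounds to 0; TF1 ships a fused, numerically safer op. An alternative formulation of the layer2/loss lines:
logits = tf.add(tf.matmul(layer1, weight2), bias2)
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=outputLayer, logits=logits))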
In [437]:
inputVector.shape
Out[437]:
In [438]:
iteration = 50000
# inputVector has shape (52, 12): 52 training pairs, V = 12
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for i in range(iteration):
    sess.run(optimizer,feed_dict={inputLayer:inputVector,outputLayer:outputVector})
    if i % 1000 == 0:
        print("{0}-{1}".format(i, sess.run(loss, feed_dict={inputLayer:inputVector,outputLayer:outputVector})))
In [439]:
denseVector = sess.run(weight1 + bias1)
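Why weight1 + bias1 works: each input row is one-hot, so the hidden activation for word i is exactly row i of weight1 plus the bias. A sanity check, not in the original cells:
hidden = sess.run(layer1, feed_dict={inputLayer: X})  # X is the one-hot identity here
np.allclose(hidden, denseVector)                      # expected: True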
In [440]:
# the embedding of each word
for t,v in zip(vocabulrary,denseVector):
    print(t,v)
In [441]:
denseVector = sess.run(weight2).T
In [442]:
# the embedding of each word
for t,v in zip(vocabulrary,denseVector):
    print(t,v)
In [443]:
import matplotlib.pyplot as plt
for w,v in zip(vocabulrary,denseVector):
    plt.text(v[0],v[1],w)
plt.xlim(-3.6,0.7)
plt.ylim(-2.4,3.6)
plt.show()
# related words can be seen clustering together
In [413]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
# Keras provides a dedicated Embedding layer
ANN = Sequential()
ANN.add(Dense(DIM,input_dim=V))
ANN.add(Dense(V, activation='softmax'))
ANN.compile(loss='categorical_crossentropy',optimizer='adadelta',metrics=['accuracy'])
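As the comment above notes, Keras has a dedicated Embedding layer; a minimal sketch of the same model fed integer word indices instead of one-hot rows (sketch only, not trained here):
from tensorflow.keras.layers import Embedding, Flatten
alt = Sequential()
alt.add(Embedding(V, DIM, input_length=1))  # lookup table: word index -> DIM-dim vector
alt.add(Flatten())
alt.add(Dense(V, activation='softmax'))
alt.compile(loss='sparse_categorical_crossentropy', optimizer='adadelta')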
In [414]:
ANN.summary()
In [422]:
history = ANN.fit(inputVector,outputVector, epochs=10000,batch_size=200)
# what matters is that the loss keeps decreasing
In [423]:
plt.plot(history.history['acc'])
plt.plot(history.history['loss'])
plt.show()
In [424]:
denseVector2 = ANN.get_weights()[0]
In [425]:
# the embedding of each word
for t,v in zip(vocabulrary,denseVector2):
    print(t,v)
In [426]:
def distance(x1,x2):
    return np.linalg.norm(x1-x2)
    # return np.sqrt(np.sum((x1-x2) ** 2))
def angle(x1,x2):
    # cosine similarity (despite the name, not an angle in radians)
    return x1.dot(x2) / (np.linalg.norm(x1) * np.linalg.norm(x2))
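A quick check of the two helpers on a simple pair:
v = np.array([3.0, 4.0])
distance(v, v), angle(v, np.array([6.0, 8.0]))  # (0.0, 1.0): same direction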
In [433]:
# result = denseVector2.T
result = denseVector2
query = result[word2ind('queen')]
# query = result[word2ind('queen')] - result[word2ind('princess')] + result[word2ind('king')]
for i in range(len(result)):
    print(vocabulrary[i], distance(query, result[i]), angle(query,result[i]))
In [446]:
import matplotlib.pyplot as plt
for w,v in zip(vocabulrary,denseVector2):
    plt.text(v[0],v[1],w)
plt.xlim(-3.6,0.7)
plt.ylim(-2.4,3.6)
plt.show()