728x90

머신러닝 자연어 처리 기술

Word_Embedding¶

In [109]:

import numpy as np

words = ["I","like","enjoy","deep","learning","NLP","flying","."]
X = np.array([[0,2,1,0,0,0,0,0],
             [2,0,0,1,0,1,0,0],
             [1,0,0,0,0,0,1,0],
             [0,1,0,0,1,0,0,0],
             [0,0,0,1,0,0,0,1],
             [0,1,0,0,0,0,0,1],
             [0,0,1,0,0,0,0,1],
             [0,0,0,0,1,1,1,0]])

U,Sigma,V = np.linalg.svd(X,full_matrices=False)

In [110]:

Sigma

Out[110]:

array([2.75726275, 2.678248  , 1.89221277, 1.61803399, 1.19154564,
       0.94833983, 0.61803399, 0.56999221])

In [111]:

# 2차원으로  2x2 행렬의 연산을 위해서 
_Sigma = np.diag(Sigma[:2])
_Sigma

Out[111]:

array([[2.75726275, 0.        ],
       [0.        , 2.678248  ]])

In [112]:

# 각각의 단어에 대한 분산표현 
US = U[:,:2].dot(_Sigma)
SV = _Sigma.dot(V[:2])

In [113]:

US

Out[113]:

array([[-1.44515015, -1.53425886],
       [-1.63902195,  1.68761941],
       [-0.70661477,  0.73388691],
       [-0.78757738, -0.66397017],
       [-0.53253583,  0.09065737],
       [-0.8413365 , -0.78737543],
       [-0.50317243, -0.4312723 ],
       [-0.68076383,  0.42116725]])

In [114]:

for pair,w in zip(US , words):
        print(w,pair)

I [-1.44515015 -1.53425886]
like [-1.63902195  1.68761941]
enjoy [-0.70661477  0.73388691]
deep [-0.78757738 -0.66397017]
learning [-0.53253583  0.09065737]
NLP [-0.8413365  -0.78737543]
flying [-0.50317243 -0.4312723 ]
. [-0.68076383  0.42116725]

In [115]:

query = US[words.index("NLP")]
query

Out[115]:

array([-0.8413365 , -0.78737543])

In [116]:

US.shape, query.shape  # 8 * 2

Out[116]:

((8, 2), (2,))

In [117]:

# US랑 빼야함 
result = np.linalg.norm(US - np.repeat(query,len(words)).reshape(2,-1).T,axis=1)

In [118]:

# distance여서 0에 가까울수록 좋음
for dist,w in zip(result , words):
        print(w,dist)

I 0.9604300020857373
like 2.600365650423296
enjoy 1.5272161083915692
deep 0.1346064624142891
learning 0.9307520814224417
NLP 0.0
flying 0.4910848961381424
. 1.2191632307652118

Cosine Similarity

In [119]:

# a*b / |a| * |b|

qNorm = np.linalg.norm(query)
mNorm = np.linalg.norm(US, axis=1)

In [120]:

# 1 ,8 ro
qNorm, mNorm

Out[120]:

(1.1523051515781995,
 array([2.10770235, 2.35254165, 1.01877104, 1.03011384, 0.54019735,
        1.15230515, 0.66270528, 0.80051312]))

In [121]:

result = US.dot(query) / (qNorm*mNorm)

In [122]:

# cosine이어서 1에 가까울 수록 좋다. 
for dist,w in zip(result , words):
        print(w,dist)

I 0.9980150045904095
like 0.01851043547439606
enjoy 0.014188411490850409
deep 0.9986570480042436
learning 0.6051040644311393
NLP 1.0
flying 0.9990464464925265
. 0.26141095433016986

SVD Embedding

In [286]:

documents = [
    "king is a strong man",
    "queen is a wise woman",
    "boy is a young man",
    "girl is a young man",
    "prince is a young king",
    "princess is a young queen",
    "man is strong",
    "woman is pretty",
    "prince is a boy will be king",
    "princess is a girl will be queen"
]

stopwords = ["is","a","will","be"]

In [287]:

vocabulrary = list()
_documents = list()

for document in documents:
    termList = list()
    
    for t in document.lower().split():
        if t not in stopwords:
            termList.append(t)
            vocabulrary.append(t)
            
    _documents.append(termList)
vocabulrary = list(set(vocabulrary))

In [288]:

V = len(vocabulrary)
X = np.zeros((V,V))

In [289]:

X.shape

Out[289]:

(12, 12)

In [290]:

K = 1

for document in _documents:
    for v in range(len(document)-K):
        i = vocabulrary.index(document[v])
        j = vocabulrary.index(document[v+1])
        
        X[i][j] += 1
        X[j][i] += 1

In [291]:

import pandas as pd
pd.DataFrame(X,index=vocabulrary,columns=vocabulrary)

Out[291]:

	queen	woman	princess	wise	strong	boy	girl	pretty	man	young	prince	king
queen	0.0	0.0	0.0	1.0	0.0	0.0	1.0	0.0	0.0	1.0	0.0	0.0
woman	0.0	0.0	0.0	1.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0
princess	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	1.0	0.0	0.0
wise	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
strong	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	2.0	0.0	0.0	1.0
boy	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0	1.0
girl	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0
pretty	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
man	0.0	0.0	0.0	0.0	2.0	0.0	0.0	0.0	0.0	2.0	0.0	0.0
young	1.0	0.0	1.0	0.0	0.0	1.0	1.0	0.0	2.0	0.0	1.0	1.0
prince	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	1.0	0.0	0.0
king	0.0	0.0	0.0	0.0	1.0	1.0	0.0	0.0	0.0	1.0	0.0	0.0

In [292]:

U,Sigma,V = np.linalg.svd(X,full_matrices=False)

In [293]:

# 각각의 단어에 대한 분산표현 
US = U[:,:2].dot(_Sigma)
SV = _Sigma.dot(V[:2])

In [294]:

for pair,w in zip(US , vocabulrary):
        print(w,pair)

queen [-0.62574113  0.42877517]
woman [-0.0447255   0.04312074]
princess [-0.5838126  0.3892648]
wise [-0.16766539 -0.13654591]
strong [-0.80819971 -1.10643747]
boy [-0.75477279  0.11713923]
girl [-0.70898993  0.21467856]
pretty [-0.01118463 -0.01247724]
man [-1.21724451  1.54307526]
young [-1.62558179 -1.55995846]
prince [-0.59526164  0.41748839]
king [-0.79737031  0.73764269]

In [295]:

queen = US[vocabulrary.index("queen")]
king = US[vocabulrary.index("king")]
girl = US[vocabulrary.index("girl")]

query = queen - king + girl

In [296]:

strong = US[vocabulrary.index("strong")]
king = US[vocabulrary.index("king")]
man = US[vocabulrary.index("man")]

query = king - strong - man

In [297]:

# US랑 빼야함 
result = np.linalg.norm(US - np.repeat(query,len(vocabulrary)).reshape(2,-1).T,axis=1)

In [298]:

# distance여서 0에 가까울수록 좋음
for dist,w in zip(result , vocabulrary):
        print(w,dist)

queen 1.8582129775847813
woman 1.2986618485301247
princess 1.8140348818141296
wise 1.462716277798703
strong 2.475339230769569
boy 1.9913532185326235
girl 1.9389864814290207
pretty 1.2782929203807352
man 2.7426849951338035
young 3.4068365795629756
prince 1.8270525327695297
king 2.0719741466195454

In [299]:

import matplotlib.pyplot as plt
plt.figure(figsize=(9,9))

for pair,w in zip(US , vocabulrary):
    plt.text(pair[0],pair[1],w)
plt.xlim(-2.0,0.5)
plt.ylim(-2,1.7)
plt.show()

Vocabulary가 새로 추가되면 다시 SVD를 해야하고, Vocabulary 수가 많아질수록 연산량이 많아져서 힘들다는 단점이 있다.

CBOW ( 연속 BOW )

문맥으로부터 단어를 예측
소규모 데이터 셋에 대하여 성능이 좋음

one-hot vector를 input으로 함

Skip-gram 모델

단어로부터 문맥을 예측

window_size = 2 라면 앞뒤로 2 단어씩 본다는 의미

In [300]:

import tensorflow as tf

In [358]:

# One-hot vector 만들기 
X = np.ones(len(vocabulrary))
X = np.diag(X)
X

Out[358]:

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [359]:

# 뒤에 [0]은 중괄호 풀어주기 위함
vocabulrary[np.argwhere(X[0])[0][0]]

Out[359]:

'queen'

In [360]:

word2ind = lambda x:vocabulrary.index(x)
idx2word = lambda x:vocabulrary[x]

In [361]:

X[word2ind("queen")]

Out[361]:

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [362]:

# WINDOW=1
# (strong) king
# (king,man) strong
# (strong) man

In [363]:

def generatePair(D, size):
    pairList = list()
    limit = len(D)
            
    for i, term in enumerate(D):
        start = i - size
        end = i + size + 1

        for j in range(start,end):
            if -1 < j < limit and i != j:
                pairList.append((D[j],term))

    return pairList

In [364]:

########
WINDOW = 1

inputList = list()
outputList = list()

for document in _documents:
    for pair in generatePair(document,WINDOW):
        inputList.append(pair[0])  # dㅣ부분 수정 
        outputList.append(pair[1])

In [365]:

for t1,t2 in zip(inputList,outputList):
    print(t1,t2)

strong king
king strong
man strong
strong man
wise queen
queen wise
woman wise
wise woman
young boy
boy young
man young
young man
young girl
girl young
man young
young man
young prince
prince young
king young
young king
young princess
princess young
queen young
young queen
strong man
man strong
pretty woman
woman pretty
boy prince
prince boy
king boy
boy king
girl princess
princess girl
queen girl
girl queen

In [366]:

len(inputList), len(outputList)

Out[366]:

(36, 36)

In [367]:

_documents[0]

Out[367]:

['king', 'strong', 'man']

In [375]:

WINDOW = 2

inputList = list()
outputList = list()

for document in _documents:
    for pair in generatePair(document,WINDOW):
        inputList.append(X[word2ind(pair[0])])  # 이 부분 수정 
        outputList.append(X[word2ind(pair[1])])

In [391]:

for t1,t2 in zip(inputList,outputList):
    print(t1,t2)

[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.] [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.] [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.] [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.] [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.] [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.] [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.] [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.] [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.] [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.] [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.] [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.] [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.] [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.] [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.] [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

In [392]:

inputVector = np.array(inputList)
outputVector = np.array(outputList)

In [393]:

inputVector.shape, outputVector.shape

Out[393]:

((52, 12), (52, 12))

In [436]:

V = len(vocabulrary)
DIM = 2
lr = 0.1

inputLayer = tf.placeholder(tf.float32, shape = (None,V))
outputLayer = tf.placeholder(tf.float32, shape = (None,V))

weight1 = tf.Variable(tf.random_normal([V,DIM]))
bias1 = tf.Variable(tf.random_normal([DIM]))

weight2 = tf.Variable(tf.random_normal([DIM,V]))
bias2 = tf.Variable(tf.random_normal([V]))

layer1 = tf.add(tf.matmul(inputLayer,weight1) , bias1)

layer2 = tf.nn.softmax(tf.add(tf.matmul(layer1,weight2),bias2))

loss = tf.reduce_mean(-tf.reduce_sum(outputLayer * tf.log(layer2),axis=[1]))

optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr).minimize(loss)

In [437]:

inputVector.shape

Out[437]:

(52, 12)

In [438]:

iteration = 50000
# 52,12
sess = tf.Session()
sess.run(tf.global_variables_initializer())

for i in range(iteration):
    sess.run(optimizer,feed_dict={inputLayer:inputVector,outputLayer:outputVector})
    
    if i % 1000 == 0:
        print("{0}-{1}".format(i, sess.run(loss, feed_dict={inputLayer:inputVector,outputLayer:outputVector})))

0-3.667945384979248
1000-1.7845362424850464
2000-1.733593225479126
3000-1.7115153074264526
4000-1.697153091430664
5000-1.6859931945800781
6000-1.6765549182891846
7000-1.6680412292480469
8000-1.6565381288528442
9000-1.6295253038406372
10000-1.614054799079895
11000-1.6085937023162842
12000-1.6052024364471436
13000-1.6025961637496948
14000-1.6004163026809692
15000-1.5985182523727417
16000-1.5968258380889893
17000-1.5952917337417603
18000-1.5938855409622192
19000-1.5925877094268799
20000-1.5913853645324707
21000-1.5902695655822754
22000-1.589233160018921
23000-1.5882703065872192
24000-1.5873749256134033
25000-1.5865414142608643
26000-1.5857638120651245
27000-1.5850363969802856
28000-1.584354281425476
29000-1.5837126970291138
30000-1.583107590675354
31000-1.5825356245040894
32000-1.581993579864502
33000-1.5814791917800903
34000-1.580989956855774
35000-1.5805243253707886
36000-1.5800811052322388
37000-1.579659342765808
38000-1.579259991645813
39000-1.5788838863372803
40000-1.5785319805145264
41000-1.578203797340393
42000-1.5778958797454834
43000-1.5776044130325317
44000-1.5773260593414307
45000-1.5770593881607056
46000-1.5768024921417236
47000-1.5765552520751953
48000-1.5763168334960938
49000-1.5760862827301025

In [439]:

denseVector = sess.run(weight1 + bias1)

In [440]:

# 의미 
for t,v in zip(vocabulrary,denseVector):
    print(t,v)

queen [-1.3560934 -1.1338634]
woman [ 1.7741053 -7.968771 ]
princess [ 0.3360301  -0.13280794]
wise [-0.46626854 -1.7719223 ]
strong [6.22105  2.574712]
boy [-0.08729273  2.847761  ]
girl [0.722275  0.3943334]
pretty [-5.4403872 -2.6272833]
man [-1.3587679  4.5057545]
young [0.8895804 0.6234251]
prince [-0.8449539  4.2878323]
king [-1.2167302  5.045244 ]

In [441]:

denseVector = sess.run(weight2).T

In [442]:

# 의미 
for t,v in zip(vocabulrary,denseVector):
    print(t,v)

queen [ 3.1757262 -3.3555548]
woman [-1.852784  -4.7720513]
princess [-1.657531    0.73468393]
wise [-1.2683164 -5.071105 ]
strong [-4.7982287  2.7737722]
boy [0.5400045 3.0944598]
girl [-2.0258408  1.427884 ]
pretty [ 2.784725 -4.301944]
man [3.0098317 3.7384193]
young [-2.082117   1.7151161]
prince [2.4121315 3.9171267]
king [2.8972592 4.1201396]

In [443]:

import matplotlib.pyplot as plt

for w,v in zip(vocabulrary,denseVector):
    plt.text(v[0],v[1],w)
plt.xlim(-3.6,0.7)
plt.ylim(-2.4,3.6)
plt.show()

# 잘 뭉쳐있는 것을 확인할 수 있다.

In [413]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *

# Embbed layer가 따로 있음
ANN = Sequential()
ANN.add(Dense(DIM,input_dim=V))
ANN.add(Dense(V, activation='softmax'))
ANN.compile(loss='categorical_crossentropy',optimizer='adadelta',metrics=['accuracy'])

In [414]:

ANN.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_4 (Dense)              (None, 2)                 26        
_________________________________________________________________
dense_5 (Dense)              (None, 12)                36        
=================================================================
Total params: 62
Trainable params: 62
Non-trainable params: 0
_________________________________________________________________

In [422]:

history = ANN.fit(inputVector,outputVector, epochs=10000,batch_size=200)
# loss가 지속적으로 떨어지고 있다는 게 중요한 것

Epoch 1/10000
52/52 [==============================] - 0s 56us/sample - loss: 1.6776 - acc: 0.3077
Epoch 2/10000
52/52 [==============================] - 0s 37us/sample - loss: 1.6774 - acc: 0.3077
Epoch 3/10000
52/52 [==============================] - 0s 37us/sample - loss: 1.6773 - acc: 0.3077
Epoch 4/10000
52/52 [==============================] - 0s 40us/sample - loss: 1.6771 - acc: 0.3077
Epoch 5/10000
Epoch 8/10000
52/52 [==============================] - 0s 34us/sample - loss: 1.6765 - acc: 0.3077
Epoch 9/10000
52/52 [==============================] - 0s 38us/sample - loss: 1.6764 - acc: 0.3077
Epoch 10/10000
52/52 [==============================] - 0s 30us/sample - loss: 1.6762 - acc: 0.3077
Epoch 11/10000
52/52 [==============================] - 0s 39us/sample - loss: 1.6761 - acc: 0.3077
Epoch 12/10000
52/52 [==============================] - 0s 31us/sample - loss: 1.6760 - acc: 0.3077
Epoch 13/10000
52/52 [==============================] - 0s 41us/sample - loss: 1.6758 - acc: 0.3077
Epoch 14/10000
52/52 [==============================] - 0s 50us/sample - loss: 1.6757 - acc: 0.3077
Epoch 15/10000
52/52 [==============================] - 0s 42us/sample - loss: 1.6755 - acc: 0.3077
Epoch 16/10000
52/52 [==============================] - 0s 45us/sample - loss: 1.6754 - acc: 0.3077
Epoch 17/10000
52/52 [==============================] - 0s 36us/sample - loss: 1.6752 - acc: 0.3077
Epoch 18/10000
52/52 [==============================] - 0s 32us/sample - loss: 1.6751 - acc: 0.3077


:
:
:
Epoch 9999/10000
52/52 [==============================] - 0s 31us/sample - loss: 1.6076 - acc: 0.3269
Epoch 10000/10000
52/52 [==============================] - 0s 37us/sample - loss: 1.6068 - acc: 0.3269

In [423]:

plt.plot(history.history['acc'])
plt.plot(history.history['loss'])
plt.show()

In [424]:

denseVector2 = ANN.get_weights()[0]

In [425]:

# 의미 
for t,v in zip(vocabulrary,denseVector2):
    print(t,v)

queen [0.31132257 2.1597593 ]
woman [0.9729645 3.163769 ]
princess [ 1.3818107  -0.56394166]
wise [3.2550652 3.8211248]
strong [ 0.06374381 -8.398722  ]
boy [-1.7401673 -1.7778441]
girl [ 1.6003996 -1.823022 ]
pretty [7.185945 8.364365]
man [-4.0047626  -0.41559717]
young [ 1.6273608 -2.280593 ]
prince [-3.8873901 -1.2059666]
king [-4.570071  -1.0653982]

In [426]:

def distance(x1,x2):
    return np.linalg.norm(x1-x2)
#     return np.sqrt(np.sum((x1-x2) **2))
def angle(x1,x2):
    return x1.dot(x2) / (np.linalg.norm(x1) * np.linalg.norm(x2))

In [433]:

# result = denseVector2.T
result = denseVector2

query = result[word2ind('queen')]
# query = result[word2ind('queen')] - result[word2ind('princess')] + result[word2ind('king')]

for i in range(len(result)):
    print(vocabulrary[i], distance(query, result[i]), angle(query,result[i]))

queen 0.0 0.9999999
woman 1.2024165 0.98798174
princess 2.9265153 -0.2419006
wise 3.3802009 0.8459711
strong 10.561383 -0.98865867
boy 4.4399695 -0.8071263
girl 4.1861997 -0.64968884
pretty 9.260537 0.8437303
man 5.0260377 -0.24407558
young 4.631273 -0.72281045
prince 5.381199 -0.42952967
king 5.8506107 -0.3636612

In [446]:

import matplotlib.pyplot as plt

for w,v in zip(vocabulrary,denseVector2):
    plt.text(v[0],v[1],w)
plt.xlim(-3.6,0.7)
plt.ylim(-2.4,3.6)
plt.show()

In [ ]:

728x90

저작자표시 (새창열림)

[NLP] Day 39 - Word_Embedding

머신러닝 자연어 처리 기술

Word_Embedding¶

Cosine Similarity

SVD Embedding

CBOW ( 연속 BOW )

Skip-gram 모델

티스토리툴바