728x90
반응형
In [1]:
# Load the MNIST dataset (70,000 images of 28x28 = 784 pixels).
# `as_frame=False` makes fetch_openml return plain NumPy arrays (the
# pre-0.24 scikit-learn behaviour this notebook was written against);
# with the modern DataFrame default, the integer fancy-indexing used
# later (e.g. mnist['data'][idx]) would fail.
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1, as_frame=False)
In [2]:
# Use the canonical MNIST partition: the first 60,000 images for
# training and the remaining 10,000 for testing.
split = 60000
X_train, X_test = mnist['data'][:split], mnist['data'][split:]
y_train, y_test = mnist['target'][:split], mnist['target'][split:]
In [3]:
# Baseline: train a small random forest on the raw 784-dim pixels and time it.
from sklearn.ensemble import RandomForestClassifier
import time

rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)
# time.perf_counter() is monotonic and high-resolution — the documented
# choice for measuring elapsed time (time.time() can jump with clock changes).
t0 = time.perf_counter()
rnd_clf.fit(X_train, y_train)
t1 = time.perf_counter()
In [5]:
print("훈련시간 {:.2f}s".format(t1-t0))
In [6]:
# Evaluate generalization: accuracy on the 10,000 held-out test images.
from sklearn.metrics import accuracy_score

rf_test_predictions = rnd_clf.predict(X_test)
accuracy_score(y_test, rf_test_predictions)
Out[6]:
In [13]:
# Apply PCA, keeping enough components to preserve 95% of the variance
# (a float n_components is interpreted as an explained-variance ratio).
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)
X_train_reduced = pca.fit_transform(X_train)
In [14]:
# Train the same random forest on the PCA-reduced features to compare
# training time and accuracy against the raw-pixel baseline.
rnd_clf2 = RandomForestClassifier(n_estimators=10, random_state=42)
# perf_counter: monotonic, high-resolution timer — preferred for benchmarking.
t0 = time.perf_counter()
rnd_clf2.fit(X_train_reduced, y_train)
t1 = time.perf_counter()
In [15]:
print("훈련시간 {:.2f}s".format(t1-t0))
In [16]:
# Project the test set with the PCA fitted on the training set (never
# refit on test data), then score the reduced-feature forest.
X_test_reduced = pca.transform(X_test)
rf_reduced_predictions = rnd_clf2.predict(X_test_reduced)
accuracy_score(y_test, rf_reduced_predictions)
Out[16]:
In [17]:
# Try softmax (multinomial logistic) regression on the raw pixels.
from sklearn.linear_model import LogisticRegression

# `multi_class="multinomial"` was deprecated in scikit-learn 1.5 and has
# since been removed; it is the default behaviour for the lbfgs solver on
# multiclass targets, so dropping it preserves the fitted model.
log_clf = LogisticRegression(solver="lbfgs", max_iter=2000, random_state=42)
# perf_counter: monotonic, high-resolution timer — preferred for benchmarking.
t0 = time.perf_counter()
log_clf.fit(X_train, y_train)
t1 = time.perf_counter()
In [18]:
print("훈련시간 {:.2f}s".format(t1-t0))
In [19]:
# Accuracy of softmax regression on the raw test pixels.
softmax_predictions = log_clf.predict(X_test)
accuracy_score(y_test, softmax_predictions)
Out[19]:
In [20]:
# Softmax regression on the PCA-reduced features.
# `multi_class="multinomial"` is deprecated/removed in recent scikit-learn
# and is the lbfgs default for multiclass data, so it is no longer passed.
log_clf2 = LogisticRegression(solver="lbfgs", max_iter=2000, random_state=42)
# perf_counter: monotonic, high-resolution timer — preferred for benchmarking.
t0 = time.perf_counter()
log_clf2.fit(X_train_reduced, y_train)
t1 = time.perf_counter()
In [21]:
print("훈련시간 {:.2f}s".format(t1-t0))
In [22]:
# Accuracy of softmax regression on the PCA-reduced test set.
softmax_pca_predictions = log_clf2.predict(X_test_reduced)
accuracy_score(y_test, softmax_pca_predictions)
Out[22]:
In [23]:
# mnist was already loaded above; convert the string labels ('0'..'9')
# to integers so boolean masks like (y == 4) work below.
# NOTE: numpy was never imported earlier in this notebook, and the
# `np.int` alias was deprecated in NumPy 1.20 and removed in 1.24 —
# the builtin `int` is the drop-in replacement.
import numpy as np

mnist.target = mnist.target.astype(int)
In [25]:
# Seed the global RNG so the same 10,000-image sample is drawn every run.
np.random.seed(42)
# Dimensionality reduction on all 60,000 images would take very long,
# so randomly select only 10,000 images to work with.
m = 10000
idx = np.random.permutation(60000)[:m]
X = mnist['data'][idx]
y = mnist['target'][idx]
In [26]:
# Reduce the sample to 2 dimensions with t-SNE.
from sklearn.manifold import TSNE

X_reduced = TSNE(n_components=2, random_state=42).fit_transform(X)
In [27]:
import matplotlib.pyplot as plt

# Scatter the 2-D t-SNE embedding, one colour per digit class.
plt.figure(figsize=(13, 10))
xs, ys = X_reduced[:, 0], X_reduced[:, 1]
plt.scatter(xs, ys, c=y, cmap='jet')
plt.axis('off')
plt.colorbar()
plt.show()
In [29]:
# Look at just two easily-confused digits (4 and 9).
import matplotlib

plt.figure(figsize=(9, 9))
# matplotlib.cm.get_cmap() was deprecated in Matplotlib 3.7 and removed
# in 3.9; the colormap registry is the supported replacement, and the
# returned Colormap object is callable exactly like before.
cmap = matplotlib.colormaps["jet"]
for digit in (4, 9):
    # Colour each digit by its position in 0..9 on the jet colormap.
    plt.scatter(X_reduced[y == digit, 0], X_reduced[y == digit, 1], c=cmap(digit / 9))
plt.axis('off')
plt.show()
In [30]:
# Re-run t-SNE on only the 4s and 9s so the embedding can focus
# entirely on separating these two classes.
mask = (y == 4) | (y == 9)
X_subset, y_subset = X[mask], y[mask]
tsne_subset = TSNE(n_components=2, random_state=42)
X_subset_reduced = tsne_subset.fit_transform(X_subset)
In [31]:
# Plot the dedicated 4-vs-9 embedding with the same per-digit colours.
plt.figure(figsize=(9, 9))
for digit in (4, 9):
    points = X_subset_reduced[y_subset == digit]
    plt.scatter(points[:, 0], points[:, 1], c=cmap(digit / 9))
plt.axis('off')
plt.show()
In [34]:
from sklearn.preprocessing import MinMaxScaler
from matplotlib.offsetbox import AnnotationBbox, OffsetImage


def plot_digits(X, y, min_distance=0.05, images=None, figsize=(13, 10)):
    """Scatter-plot a 2-D embedding of digit images.

    Draws one coloured point cloud per class, then overlays either the
    class label (as bold text) or the digit thumbnail itself at points
    that are at least ``min_distance`` apart in normalized coordinates,
    so the annotations do not pile on top of each other.

    Parameters
    ----------
    X : array of shape (n_samples, 2) — the 2-D embedding to plot.
    y : array of integer class labels (assumed to lie in 0..9, since
        colours are chosen as digit/9 on the jet colormap).
    min_distance : minimum normalized distance between two annotations.
    images : optional array of flattened 28x28 images; when given,
        thumbnails are drawn instead of text labels.
    figsize : size of the created matplotlib figure.
    """
    # Rescale both embedding axes into [0, 1] so `min_distance` means
    # the same thing regardless of the embedding's raw scale.
    X_normalized = MinMaxScaler().fit_transform(X)
    # Coordinates of every annotation drawn so far. Seeded with a point
    # far outside [0, 1]^2 so the first real point always passes the
    # distance test without a special case inside the loop.
    neighbors = np.array([[10., 10.]])
    plt.figure(figsize=figsize)
    # matplotlib.cm.get_cmap() was removed in Matplotlib 3.9; the
    # colormap registry is the supported, callable replacement.
    cmap = matplotlib.colormaps["jet"]
    digits = np.unique(y)
    for digit in digits:
        plt.scatter(X_normalized[y == digit, 0], X_normalized[y == digit, 1], c=cmap(digit / 9))
    plt.axis("off")
    ax = plt.gcf().gca()  # axes of the figure created above
    for index, image_coord in enumerate(X_normalized):
        # Distance to the nearest already-drawn annotation.
        closest_distance = np.linalg.norm(np.array(neighbors) - image_coord, axis=1).min()
        if closest_distance > min_distance:
            neighbors = np.r_[neighbors, [image_coord]]
            if images is None:
                # No thumbnails: write the digit itself in its class colour.
                plt.text(image_coord[0], image_coord[1], str(int(y[index])),
                         color=cmap(y[index] / 9), fontdict={"weight": "bold", "size": 16})
            else:
                # Draw the 28x28 image as a small annotation box.
                image = images[index].reshape(28, 28)
                imagebox = AnnotationBbox(OffsetImage(image, cmap="binary"), image_coord)
                ax.add_artist(imagebox)
In [35]:
# Full 10,000-sample t-SNE embedding with coloured digit labels.
plot_digits(X_reduced, y)
In [36]:
# Same embedding, but overlay the actual digit thumbnails instead of labels.
plot_digits(X_reduced, y, images=X, figsize=(35, 25))
In [37]:
plot_digits(X_subset_reduced, y_subset, images=X_subset, figsize=(22, 22))
In [38]:
# Linear baseline: plain PCA straight down to 2-D (fast but linear).
from sklearn.decomposition import PCA
import time

pca = PCA(n_components=2, random_state=42)
# perf_counter: monotonic, high-resolution timer — preferred for benchmarking.
t0 = time.perf_counter()
X_pca_reduced = pca.fit_transform(X)
t1 = time.perf_counter()
print("PCA 시간: {:.1f}s.".format(t1 - t0))
plot_digits(X_pca_reduced, y)
plt.show()
In [39]:
# Locally Linear Embedding down to 2-D.
from sklearn.manifold import LocallyLinearEmbedding

lle = LocallyLinearEmbedding(n_components=2, random_state=42)
# perf_counter: monotonic, high-resolution timer — preferred for benchmarking.
t0 = time.perf_counter()
X_lle_reduced = lle.fit_transform(X)
t1 = time.perf_counter()
print("LLE 시간: {:.1f}s.".format(t1 - t0))
plot_digits(X_lle_reduced, y)
plt.show()
In [42]:
# PCA to 95% variance first, then LLE: the PCA step denoises and makes
# LLE's neighbour search much faster.
from sklearn.pipeline import Pipeline

pca_lle = Pipeline([
    ('pca', PCA(n_components=0.95, random_state=42)),
    ('lle', LocallyLinearEmbedding(n_components=2, random_state=42)),
])
# perf_counter: monotonic, high-resolution timer — preferred for benchmarking.
t0 = time.perf_counter()
X_pca_lle_reduced = pca_lle.fit_transform(X)
t1 = time.perf_counter()
print("PCA + LLE 시간: {:.1f}s.".format(t1 - t0))
plot_digits(X_pca_lle_reduced, y)
plt.show()
In [44]:
# MDS scales poorly with sample count, so use only the first 2,000
# of the 10,000 sampled images.
from sklearn.manifold import MDS

m = 2000
# perf_counter: monotonic, high-resolution timer — preferred for benchmarking.
t0 = time.perf_counter()
X_mds_reduced = MDS(n_components=2, random_state=42).fit_transform(X[:m])
t1 = time.perf_counter()
print("MDS시간: {:.1f}s.".format(t1 - t0))
plot_digits(X_mds_reduced, y[:m])
plt.show()
In [45]:
# PCA to 95% variance first, then MDS, on the same 2,000-sample subset.
from sklearn.pipeline import Pipeline

m = 2000
pca_mds = Pipeline([
    ('pca', PCA(n_components=0.95, random_state=42)),
    ('mds', MDS(n_components=2, random_state=42)),
])
# perf_counter: monotonic, high-resolution timer — preferred for benchmarking.
t0 = time.perf_counter()
X_pca_mds_reduced = pca_mds.fit_transform(X[:m])
t1 = time.perf_counter()
print("PCA + MDS시간: {:.1f}s.".format(t1 - t0))
plot_digits(X_pca_mds_reduced, y[:m])
plt.show()
In [48]:
# LDA: a supervised linear reduction — unlike the others it uses the labels y.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis(n_components=2)
# perf_counter: monotonic, high-resolution timer — preferred for benchmarking.
t0 = time.perf_counter()
X_lda_reduced = lda.fit_transform(X, y)
t1 = time.perf_counter()
print("LDA 시간: {:.1f}s.".format(t1 - t0))
plot_digits(X_lda_reduced, y, figsize=(12, 12))
plt.show()
In [49]:
# PCA (95% variance) first, then t-SNE — typically the best
# speed/quality trade-off of the methods tried here.
pca_tsne = Pipeline([
    ("pca", PCA(n_components=0.95, random_state=42)),
    ("tsne", TSNE(n_components=2, random_state=42)),
])
# perf_counter: monotonic, high-resolution timer — preferred for benchmarking.
t0 = time.perf_counter()
X_pca_tsne_reduced = pca_tsne.fit_transform(X)
t1 = time.perf_counter()
print("PCA+t-SNE 시간 {:.1f}s.".format(t1 - t0))
plot_digits(X_pca_tsne_reduced, y)
plt.show()
728x90
반응형