728x90
반응형
In [12]:
from sklearn.datasets import make_moons

# 1,000 points from the two-interleaving-half-moons toy dataset,
# with substantial Gaussian noise; seeded for reproducibility.
X, y = make_moons(n_samples=1000, noise=0.4, random_state=42)
In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the data as the final test set (seeded shuffle).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Base estimator; fixed seed so the search is reproducible.
tree_clf = DecisionTreeClassifier(random_state=42)

# Tune the tree-size limits with 3-fold cross-validation on the
# training set, using every CPU core and a brief progress log.
params = {
    'max_leaf_nodes': list(range(2, 100)),
    'min_samples_split': [2, 3, 4],
}
grid_search_cv = GridSearchCV(tree_clf, param_grid=params,
                              cv=3, n_jobs=-1, verbose=1)
grid_search_cv.fit(X_train, y_train)
Out[14]:
In [15]:
# Best tree found by the search (with refit=True, GridSearchCV's default,
# it has already been retrained on the full training set).
grid_search_cv.best_estimator_
Out[15]:
In [16]:
from sklearn.metrics import accuracy_score

# No separate training step is needed: GridSearchCV has already refit
# the best model on the whole training set.
y_pred = grid_search_cv.best_estimator_.predict(X_test)
accuracy_score(y_test, y_pred)
Out[16]:
In [26]:
from sklearn.model_selection import ShuffleSplit

n_trees = 1000      # number of random subsets (one per tree below)
n_instances = 100   # training instances kept in each subset

# Every split keeps n_instances samples on the "train" side and sends the
# rest to the (unused) "test" side, so each subset is a random 100-sample
# draw from X_train. Result: a list of n_trees (X, y) training subsets.
rs = ShuffleSplit(n_splits=n_trees,
                  test_size=len(X_train) - n_instances,
                  random_state=42)
mini_sets = [(X_train[train_index], y_train[train_index])
             for train_index, _ in rs.split(X_train)]
In [27]:
from sklearn.base import clone
import numpy as np  # FIX: np.mean is used below but numpy was never imported

# clone() constructs a new, unfitted estimator with the same hyperparameters
# as the grid search's best tree -- one fresh copy per training subset.
forest = [clone(grid_search_cv.best_estimator_) for _ in range(n_trees)]

# Fit each tree on its own small subset and score it on the full test set.
acc_scores = []
for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
    tree.fit(X_mini_train, y_mini_train)
    y_pred = tree.predict(X_test)
    acc_scores.append(accuracy_score(y_test, y_pred))

# Mean accuracy of the individual small trees.
np.mean(acc_scores)
Out[27]:
In [28]:
import numpy as np  # FIX: np is used here but numpy was never imported in this notebook

# Collect every tree's test-set predictions: row i holds tree i's labels.
# uint8 is enough since make_moons labels are 0/1.
Y_pred = np.empty([n_trees, len(X_test)], dtype=np.uint8)
for tree_idx, tree in enumerate(forest):
    Y_pred[tree_idx] = tree.predict(X_test)
In [29]:
from scipy.stats import mode

# Majority vote across the forest: the per-column mode of the prediction
# matrix, plus how many trees cast that winning vote.
# NOTE(review): SciPy >= 1.11 returns a 1-D mode here (keepdims defaults
# to False); older versions return shape (1, n). The reshape(-1) applied
# downstream handles both -- confirm against the installed SciPy.
vote_result = mode(Y_pred, axis=0)
y_pred_major_vote, n_votes = vote_result
In [30]:
# Test-set accuracy of the majority-vote ensemble; reshape flattens the
# mode output (which may be (1, n) or (n,) depending on SciPy version).
accuracy_score(y_test,y_pred_major_vote.reshape([-1]))
Out[30]:
728x90
반응형