import warnings
warnings.filterwarnings(action="ignore")


# Download the MNIST digits dataset ('mnist_784') from OpenML.
# NOTE(review): later cells index the data positionally (e.g. X[36000]), which
# assumes `mnist.data` is a NumPy array; newer scikit-learn releases may return
# a DataFrame here — confirm against the installed version.
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784')
# Notebook cell output: shapes of the feature matrix and of the label vector.
mnist.data.shape, mnist.target.shape

((70000, 784), (70000,))


X, y = mnist.data, mnist.target
print(X.shape, y.shape)

(70000, 784) (70000,)


import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

# Pick one sample and reshape its flat row into a 28x28 image for display.
some_digits = X[36000]
some_digit_image = some_digits.reshape(28, 28)

# Draw the image with the binary colormap and hide the axes.
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary, interpolation="nearest")
plt.axis("off")
plt.show()


y[36000]

'9'


import numpy as np
# The labels are strings (y[36000] is '9' above); convert them to integers so
# the numeric comparisons used later (== 5, >= 7, % 2) work.
# `np.int` was only an alias of the builtin `int` and has been removed from
# recent NumPy releases, so the builtin is used directly.
y = y.astype(int)


# The first 60,000 samples form the training set; the rest is the test set.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]


# Shuffle the training set, applying one permutation to both arrays so that
# samples and labels stay aligned.
# NOTE(review): no seed is set in the visible code, so the order (and the
# scores printed below) may differ between runs.
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]


# Boolean targets for the binary "is this digit a 5?" task.
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)


print(y_train_5.shape, y_test_5.shape)

(60000,) (10000,)


from sklearn.linear_model import SGDClassifier

# Fit an SGDClassifier as the binary 5-vs-not-5 detector.
# random_state is fixed so repeated runs of this cell use the same seed.
sgd_clf = SGDClassifier(max_iter = 5, random_state=42)
sgd_clf.fit(X_train, y_train_5)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=42, shuffle=True, tol=0.001, validation_fraction=0.1,
              verbose=0, warm_start=False)


sgd_clf.predict([some_digits])

array([False])


from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring='accuracy')

array([0.9611, 0.9567, 0.9612])


from sklearn.base import BaseEstimator

class Never5Classifier(BaseEstimator):
    """Baseline classifier that predicts "not 5" (False) for every sample."""

    def fit(self, X, y=None):
        """Accept the training data without learning anything.

        Returns:
            The estimator itself, following the scikit-learn convention that
            ``fit`` returns ``self`` so calls can be chained.
        """
        return self

    def predict(self, X):
        """Return an all-False boolean array of shape ``(len(X), 1)``."""
        return np.zeros((len(X), 1), dtype=bool)


never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring='accuracy')

array([0.9115 , 0.9077 , 0.90975])


from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)


from sklearn.metrics import confusion_matrix

confusion_matrix(y_train_5, y_train_pred)

array([[53804,   775],
       [ 1645,  3776]])


from sklearn.metrics import precision_score, recall_score

print(precision_score(y_train_5, y_train_pred))
print(recall_score(y_train_5, y_train_pred))

0.8297077565370248
0.6965504519461354


from sklearn.metrics import f1_score
f1_score(y_train_5, y_train_pred)

0.7573204973926995


y_scores = sgd_clf.decision_function([some_digits])
y_scores

array([-311247.82568518])


threshold = 0
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred

array([False])


y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method = "decision_function")


from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)


def plot_precision_reall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold.

    The last element of ``precisions`` and of ``recalls`` is dropped before
    plotting, so both curves are drawn over ``thresholds``.
    The y-axis is limited to [0, 1] and the legend sits at the center left.
    """
    curves = (
        (precisions, "b--", "precision"),
        (recalls, "g--", "recall"),
    )
    for values, line_style, name in curves:
        plt.plot(thresholds, values[:-1], line_style, label=name)

    plt.xlabel("threshold")
    plt.ylim([0, 1])
    plt.legend(loc="center left")

plot_precision_reall_vs_threshold(precisions, recalls, thresholds)
plt.show()


from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)


def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve (tpr against fpr) with a dashed diagonal.

    Both axes are limited to [0, 1]. ``label`` is passed to the curve so a
    later ``plt.legend()`` call can show it.
    """
    unit_span = [0, 1]
    plt.plot(fpr, tpr, label=label, linewidth=2)
    # Dashed black diagonal from (0, 0) to (1, 1).
    plt.plot(unit_span, unit_span, "k--")
    plt.axis(unit_span + unit_span)
    plt.xlabel("거짓 양성 비율")
    plt.ylabel("진짜 양성 비율")

plot_roc_curve(fpr, tpr)
plt.show()


from sklearn.metrics import roc_auc_score
roc_auc_score(y_train_5, y_scores)

0.9570486413046225


from sklearn.ensemble import RandomForestClassifier

# Cross-validated class probabilities from a random forest on the 5-vs-not-5 task.
forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3, method="predict_proba")


y_scores_forest = y_probas_forest[:, 1] # Use the positive-class probability as the score.
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)


plot_roc_curve(fpr_forest, tpr_forest, label="RandomForest")
plt.plot(fpr, tpr, "b:", label="SGD")
plt.legend()
plt.show()


roc_auc_score(y_train_5, y_scores_forest)

0.9923374256972404


sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digits])

array([9])


some_digit_score = sgd_clf.decision_function([some_digits])
some_digit_score

array([[-932629.58538505, -188216.61815534, -854115.54854604,
          -4067.36678374,  -31814.75205053, -269902.35748785,
        -732616.21115564, -292140.3639404 , -232521.30861646,
           4823.27137801]])


np.argmax(some_digit_score)

9


sgd_clf.classes_

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


sgd_clf.classes_[4]

4


from sklearn.multiclass import OneVsOneClassifier

ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=5, random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digits])

array([9])


len(ovo_clf.estimators_)

45


forest_clf.fit(X_train, y_train)
forest_clf.predict([some_digits])

array([9])


forest_clf.predict_proba([some_digits])

array([[0. , 0. , 0. , 0. , 0.2, 0.2, 0. , 0. , 0. , 0.6]])


cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

array([0.88125, 0.8838 , 0.88685])


# Evaluate the RandomForestClassifier the same way (3-fold accuracy).
cross_val_score(forest_clf, X_train, y_train, cv=3, scoring="accuracy")

array([0.94155, 0.93985, 0.94215])


from sklearn.preprocessing import StandardScaler

# Scale the training features with StandardScaler (cast to float64 first),
# then re-run the 3-fold accuracy check of the SGD classifier on the scaled data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")

array([0.90775, 0.90885, 0.9128 ])


y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx

array([[5740,    3,   23,    8,   12,   45,   43,    7,   38,    4],
       [   2, 6477,   42,   25,    6,   48,    7,   13,  111,   11],
       [  58,   41, 5338,  110,   78,   18,   79,   52,  168,   16],
       [  52,   40,  134, 5331,    2,  253,   33,   53,  135,   98],
       [  25,   25,   37,    7, 5358,   10,   50,   30,   84,  216],
       [  76,   39,   33,  172,   70, 4619,  111,   26,  173,  102],
       [  43,   25,   57,    2,   51,   88, 5600,    7,   44,    1],
       [  31,   20,   71,   32,   58,    9,    5, 5785,   14,  240],
       [  55,  152,   71,  156,   14,  162,   52,   21, 5029,  139],
       [  38,   30,   28,   80,  157,   40,    2,  185,   78, 5311]])


plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()


# Divide every row of the confusion matrix by its row total, turning raw
# counts into per-row proportions.
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums


# Zero the diagonal (modifies norm_conf_mx in place) so only the off-diagonal
# entries show up in the plot.
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()


# Split the 3s and 5s of the training set by (true label, predicted label).
cl_a, cl_b = 3, 5

X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]  # true 3, predicted 3
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]  # true 3, predicted 5
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]  # true 5, predicted 3
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]  # true 5, predicted 5


def plot_digits(lst):
    """Draw up to 25 digit images from ``lst`` on a 5x5 grid in a new figure.

    Args:
        lst: Sequence of flat arrays, each of which can be reshaped to 28x28.
            Only the first 25 entries are drawn; shorter inputs leave the
            remaining grid cells empty.

    Returns:
        The matplotlib figure that holds the grid.
    """
    # Start from an empty figure: plt.subplots() would also create one
    # full-size axes that the 5x5 grid is then drawn on top of.
    fig = plt.figure()
    # Stop at the number of available samples so short inputs do not raise
    # IndexError.
    for i in range(min(len(lst), 25)):
        ax = fig.add_subplot(5, 5, i + 1)
        image = lst[i].reshape(28, 28)
        ax.imshow(image, cmap=matplotlib.cm.binary, interpolation="nearest")
        ax.axis("off")
    return fig


# Show the four (true, predicted) groups of 3s and 5s, one figure per group.
# NOTE(review): plot_digits() opens its own figure on every call, so the
# figure created on the next line stays empty (see the "0 Axes" output below)
# and figsize=(12, 8) does not apply to the digit grids.
plt.figure(figsize=(12, 8))
plot_digits(X_aa[:25])
plot_digits(X_ab[:25])
plot_digits(X_ba[:25])
plot_digits(X_bb[:25])
plt.show()

<Figure size 864x576 with 0 Axes>


from sklearn.neighbors import KNeighborsClassifier


# Build two boolean labels per sample: "digit is 7 or larger" and "digit is odd".
y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
# Stack the two label vectors as columns.
y_multilabel = np.c_[y_train_large, y_train_odd]

# Fit a k-nearest-neighbours classifier on the two-column target.
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')


knn_clf.predict([some_digits])

array([[ True,  True]])


#y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3, n_jobs=1)
#f1_score(y_multilabel, y_train_knn_pred, average='macro')


from numpy import random as rnd

# Add random integer noise (drawn by rnd.randint(0, 100, ...)) to every pixel
# of the training and test images.
noise = rnd.randint(0, 100, (len(X_train), 784))
X_train_mod = X_train + noise
noise = rnd.randint(0, 100,  (len(X_test), 784))
X_test_mod = X_test + noise
# The targets are the original, noise-free images: the model is to map a
# noisy image back to its clean version.
y_train_mod = X_train
y_test_mod = X_test


# Index of the training sample used for the preview and the denoising demo.
idx = 36000

# Top: the noisy input image. Bottom: its clean target image.
plt.subplot(2, 1, 1)
plt.imshow(X_train_mod[idx].reshape(28, 28))
plt.subplot(2, 1, 2)
plt.imshow(y_train_mod[idx].reshape(28, 28))
plt.show()


# Refit the KNN model to predict the clean pixels from the noisy image, then
# "clean" one noisy sample and display the result.
# NOTE(review): the sample passed to predict() comes from the same training
# set the model was just fitted on; X_test_mod is built above but is not used
# in the visible code — confirm whether a test sample was intended.
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_train_mod[idx]])

plt.imshow(clean_digit.reshape(28, 28))
plt.show()


# Inject a CSS rule that sets the `.container` element's width to 90%.
# NOTE(review): importing `display` from `IPython.core.display` appears to be
# deprecated in newer IPython releases in favour of `IPython.display` — confirm
# against the installed version.
from IPython.core.display import display, HTML
display(HTML("<style> .container{width:90% !important;}</style>"))

[Pyspark] Pyspark의 여러가지 함수들 (0)	2021.11.12
[Pyspark] udf에서 2개 이상의 multiple column 리턴하기 (0)	2021.11.04
[핸즈온 머신러닝2] Chapter 5. 서포트 벡터 머신 (0)	2021.01.24
[핸즈온 머신러닝2] Chapter 2. 머신러닝 프로젝트 처음부터 끝까지 (0)	2021.01.13

[핸즈온 머신러닝2] Chapter 3. 분류

분류¶

1. MNIST¶

2. 이진 분류기 훈련¶

3. 성능 측정¶

3.1 교차 검증을 사용한 정확도 측정¶

3.2 오차 행렬¶

3.3 정밀도와 재현율¶

3.4 정밀도/재현율 트레이드 오프¶

3.5 ROC 곡선¶

4. 다중 분류¶

5. 에러 분석¶

6. 다중 레이블 분류¶

7. 다중 출력 분류¶

'머신러닝 꿈나무 > Hands-on!' 카테고리의 다른 글

댓글

이 글 공유하기

티스토리툴바