본문 바로가기
코딩코딩/머신러닝, 딥러닝

빅데이터분석기사 실기 준비 - 분류 모델 11개 비교

by g0n1 2021. 6. 16.
728x90
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import StratifiedKFold as SKF
from sklearn.metrics import roc_auc_score,f1_score,precision_score,accuracy_score
# Load the training features and labels (both CSV files are read as cp949).
X_train = pd.read_csv('X_train.csv', encoding='cp949')
y_train = pd.read_csv('y_train.csv', encoding='cp949')

# Drop the cust_id column and replace every missing value with 0.
# `axis` is passed by keyword: the positional form (`drop('cust_id', 1)`) was
# deprecated in pandas 1.3 and is rejected by pandas 2.0+.
X_train = X_train.drop('cust_id', axis=1).fillna(0)
y_train = y_train['gender']

# One-hot encode the two purchase columns and append the resulting dummy
# columns after the remaining feature columns.
dummy_cols = ['주구매상품', '주구매지점']
X_train = pd.concat(
    [X_train.drop(dummy_cols, axis=1), pd.get_dummies(X_train[dummy_cols])],
    axis=1,  # keyword-only in pandas 2.0+, same reason as above
)
# --- Candidate classifiers -------------------------------------------------
# Every estimator below is constructed with its library-default
# hyper-parameters; the aliases are kept because later code may refer to them.
import xgboost
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis as LDA,
    QuadraticDiscriminantAnalysis as QDA,
)
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier as SGD
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import LinearSVC as SVC  # NOTE: `SVC` here is LinearSVC
from sklearn.tree import DecisionTreeClassifier as DT

xgb_clf = xgboost.XGBClassifier()   # gradient-boosted trees (xgboost)
log_clf = LogisticRegression()      # logistic regression
rf_clf = RandomForestClassifier()   # random forest
svc = SVC()                         # linear support vector classifier
gbc = GBC()                         # gradient boosting (scikit-learn)
nb = GaussianNB()                   # Gaussian naive Bayes
lda = LDA()                         # linear discriminant analysis
qda = QDA()                         # quadratic discriminant analysis
dt = DT()                           # single decision tree
knn = KNN()                         # k-nearest neighbours
sgd = SGD()                         # linear model trained with SGD

# The comparison loop evaluates the estimators in exactly this order.
models = [
    xgb_clf, log_clf, rf_clf, svc, gbc, nb,
    lda, qda, dt, knn, sgd,
]
# Stratified 10-fold splitter with a fixed seed. The (train, validation) index
# pairs are materialised once so that every model is scored on the same folds.
skf = SKF(n_splits=10, shuffle=True, random_state=5252)
folds = list(skf.split(X_train, y_train))

def _positive_class_scores(model, features, hard_labels):
    """Return continuous positive-class scores for ROC AUC.

    Uses the second column of ``predict_proba`` when the model provides it,
    otherwise ``decision_function``. Falls back to ``hard_labels`` (the
    output of ``predict``) when the model offers neither, or when the scores
    contain NaN/inf, which ``roc_auc_score`` would reject.
    """
    if hasattr(model, 'predict_proba'):
        scores = model.predict_proba(features)[:, 1]
    elif hasattr(model, 'decision_function'):
        scores = model.decision_function(features)
    else:
        return hard_labels
    if not np.all(np.isfinite(scores)):
        return hard_labels
    return scores


# Cross-validate every model on the shared folds and collect the fold-averaged
# metrics together with the total wall-clock time spent on that model.
n_folds = len(folds)  # divisor for averaging; follows the actual fold count
results = []
for model in models:
    model_name = type(model).__name__
    roc_auc = 0.0
    f1 = 0.0
    prec = 0.0
    acc = 0.0
    started = time.perf_counter()
    for train_idx, valid_idx in folds:
        train_x = X_train.iloc[train_idx].values
        valid_x = X_train.iloc[valid_idx].values

        train_y = y_train.iloc[train_idx].values
        valid_y = y_train.iloc[valid_idx].values

        model.fit(train_x, train_y)
        preds = model.predict(valid_x)
        # ROC AUC is a ranking metric: feed it scores, not thresholded labels.
        scores = _positive_class_scores(model, valid_x, preds)

        roc_auc += roc_auc_score(valid_y, scores) / n_folds
        f1 += f1_score(valid_y, preds) / n_folds
        prec += precision_score(valid_y, preds) / n_folds
        acc += accuracy_score(valid_y, preds) / n_folds

    # Keep the elapsed time numeric: sorting a truncated string orders rows
    # lexicographically ("10.23" < "2.345") and mangles scientific notation.
    sec = round(time.perf_counter() - started, 3)
    results.append([model_name, sec, acc, roc_auc, f1, prec])

# Fastest model first.
results.sort(key=lambda row: row[1])
pd.DataFrame(results, columns=['model', 'time(sec)', 'acc','roc_auc','f1','prec'])

 

728x90

댓글