[2021/08/25~26] Python Machine Learning: Random Forest

이융희 2021. 8. 25. 17:56

Import the modules we'll use

from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

 

Load the data and split it

iris = load_iris()

x_train = iris.data[:-30]
y_train = iris.target[:-30]

x_test = iris.data[-30:]
y_test = iris.target[-30:]
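
Note that load_iris returns the samples sorted by class (50 setosa, then 50 versicolor, then 50 virginica), so slicing off the last 30 rows leaves a test set made up entirely of the third class. A quick check with the numpy import above:

# Count samples per class in each split; the unshuffled slice puts only class 2 in the test set.
print(np.bincount(y_train))  # [50 50 20]
print(np.bincount(y_test))   # [ 0  0 30]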

 

Train and predict on the split data, using 10 decision trees

rfc = RandomForestClassifier(n_estimators=10)

rfc.fit(x_train, y_train)
prediction = rfc.predict(x_test)
# rfc.score(x_test, y_test) would return the same mean accuracy directly
print("Accuracy is :", accuracy_score(y_test, prediction))
print(classification_report(y_test, prediction))
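
The forest can also report class probabilities, i.e. the fraction of its 10 trees that voted for each class; a minimal sketch:

# Per-class probability estimates from the ensemble vote.
proba = rfc.predict_proba(x_test)
print(proba[:5])  # first five test samples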

 

Split the data 8:2 with train_test_split and predict

x = iris.data
y = iris.target
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2)

rfc.fit(X_train, Y_train)  # refit on the new training split before predicting
prediction_1 = rfc.predict(X_test)
print("Accuracy is :", accuracy_score(Y_test, prediction_1))
print(classification_report(Y_test, prediction_1))
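
Because train_test_split shuffles randomly, every run produces a different split and a slightly different accuracy. For a reproducible, class-balanced split you can pass random_state and stratify (the seed value here is just an example):

# Fixed seed for reproducibility; stratify keeps the 50/50/50 class ratio in both splits.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y)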

 

Use 200 decision trees, all 4 features at each split, and the OOB score

rfc_2 = RandomForestClassifier(n_estimators=200,
                               max_features=4,
                               oob_score=True)
rfc_2.fit(X_train, Y_train)
prediction_2 = rfc_2.predict(X_test)

print("Accuracy is :", accuracy_score(prediction_2, Y_test))
print(classification_report(Y_test, prediction_2))
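
Since oob_score=True was set, the fitted forest also carries an internal accuracy estimate computed on the out-of-bag samples (the rows each tree did not see in its bootstrap sample):

# Out-of-bag accuracy estimate, no separate validation set needed.
print("OOB score:", rfc_2.oob_score_)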

 

for feature, imp in zip(iris.feature_names, rfc_2.feature_importances_):
  print(feature, imp)
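
The same pairs are easier to read sorted by importance; a minimal sketch using the pandas import from the top of the post:

importances = pd.Series(rfc_2.feature_importances_, index=iris.feature_names)
print(importances.sort_values(ascending=False))  # most informative features first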

 

Apply a random forest to the human activity recognition dataset

feature_name_df = pd.read_csv('./human_activity_features.txt',
                              sep=r'\s+',
                              header=None,
                              names=['column_index', 'column_name'])
feature_name = feature_name_df.iloc[:, 1].values.tolist()

X_train = pd.read_csv('./human_activity_X_train.txt', sep=r'\s+', header=None)
X_train.columns = feature_name

X_test = pd.read_csv('./human_activity_X_test.txt', sep=r'\s+', header=None)
X_test.columns = feature_name

y_train = pd.read_csv('./human_activity_y_train.txt', sep=r'\s+', header=None, names=['action'])
y_test = pd.read_csv('./human_activity_y_test.txt', sep=r'\s+', header=None, names=['action'])
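
A quick sanity check that the feature matrices and label vectors line up (assuming the files follow the usual UCI HAR layout of 561 feature columns):

# Row counts of X and y must match within each split.
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)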

print(y_train["action"].value_counts())  # check the class balance of the labels
print(X_train.isnull().sum().sum())      # total number of missing values (expect 0)

rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train['action'])  # pass 1-D labels, not a one-column DataFrame
prediction_3 = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, prediction_3)
print('Random forest accuracy: {}'.format(accuracy))

 

Improving random forest performance

from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [100],            # 100 trees
    'max_depth': [6, 8, 10, 12],      # maximum depth of each tree
    'min_samples_leaf': [8, 12, 18],  # minimum number of samples in a leaf
    'min_samples_split': [8, 16, 20]  # minimum number of rows a node needs
                                      # before it may be split
}
# Build the trees on 10 CPU cores in parallel
rf_clf = RandomForestClassifier(n_jobs=10)
# Use a GridSearchCV object to find the best parameters for the random forest
# (4 x 3 x 3 = 36 combinations, each cross-validated)
grid_cv = GridSearchCV(rf_clf, param_grid=params, n_jobs=10)

import warnings
warnings.filterwarnings('ignore')  # silence warnings emitted during the grid search

grid_cv.fit(X_train, y_train)
print('Best hyperparameters:\n', grid_cv.best_params_)
print('Best CV accuracy: {0:.4f}'.format(grid_cv.best_score_))
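
By default GridSearchCV refits the best parameter combination on the whole training set; that model is exposed as best_estimator_ and can be scored on the held-out test data:

# Evaluate the refitted best model on the test split.
best_rf = grid_cv.best_estimator_
pred_best = best_rf.predict(X_test)
print('Test accuracy: {0:.4f}'.format(accuracy_score(y_test, pred_best)))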

 

 

Random forest column importances

rf_clf1 = RandomForestClassifier(n_estimators=300,
                                 max_depth=10,
                                 min_samples_leaf=8,
                                 min_samples_split=8)
rf_clf1.fit(X_train, y_train['action'])
pred = rf_clf1.predict(X_test)
print('Prediction accuracy: {}'.format(accuracy_score(y_test, pred)))

 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

ftr_importances_values = rf_clf1.feature_importances_
ftr_importances = pd.Series(ftr_importances_values, index=X_train.columns)
ftr_top20 = ftr_importances.sort_values(ascending=False)[:20]

plt.figure(figsize=(8, 6))
plt.title('Feature importances Top 20')
sns.barplot(x=ftr_top20, y=ftr_top20.index)
plt.show()

 
