1. HAR data
1.1 HAR data load
import pandas as pd
import matplotlib.pyplot as plt

# Load the UCI HAR feature names and use them as column labels
url = 'https://raw.githubusercontent.com/hmkim312/datas/main/HAR/features.txt'
feature_name_df = pd.read_csv(url, sep=r'\s+', header=None, names=['column_index', 'column_name'])
feature_name = feature_name_df.iloc[:, 1].values.tolist()

# Train/test splits are whitespace-separated text files
X_train = pd.read_csv('https://raw.githubusercontent.com/hmkim312/datas/main/HAR/X_train.txt', sep=r'\s+', header=None)
X_test = pd.read_csv('https://raw.githubusercontent.com/hmkim312/datas/main/HAR/X_test.txt', sep=r'\s+', header=None)
y_train = pd.read_csv('https://raw.githubusercontent.com/hmkim312/datas/main/HAR/y_train.txt', sep=r'\s+', header=None, names=['action'])
y_test = pd.read_csv('https://raw.githubusercontent.com/hmkim312/datas/main/HAR/y_test.txt', sep=r'\s+', header=None, names=['action'])

X_train.columns = feature_name
X_test.columns = feature_name
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((7352, 561), (2947, 561), (7352, 1), (2947, 1))
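Note: `features.txt` in the UCI HAR dataset contains duplicated feature names (the `fBodyAcc-bandsEnergy()` columns repeat), which newer pandas/scikit-learn versions reject as column labels. A minimal de-duplication sketch, using a hypothetical `dedup_names` helper that is not part of the original notebook:

```python
from collections import Counter

def dedup_names(names):
    # Hypothetical helper: append a running suffix to repeated names
    # so every column label is unique.
    seen = Counter()
    unique = []
    for name in names:
        seen[name] += 1
        unique.append(name if seen[name] == 1 else f'{name}_{seen[name]}')
    return unique

feature_name = dedup_names(feature_name)
X_train.columns = feature_name
X_test.columns = feature_name
```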
1.2 Creating a helper function
from sklearn.decomposition import PCA

def get_pca_data(ss_data, n_components=2):
    # Fit PCA and return both the transformed data and the fitted object
    pca = PCA(n_components=n_components)
    pca.fit(ss_data)
    return pca.transform(ss_data), pca
1.3 PCA fit
HAR_pca, pca = get_pca_data(X_train, n_components=2)
HAR_pca.shape
(7352, 2)
pca.mean_.shape, pca.components_.shape
((561,), (2, 561))
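The shapes above show what `PCA` stores: a per-feature mean (561 values) and two principal axes (2 × 561). `transform` is just centering followed by a projection onto those axes; a quick sketch to confirm, assuming the objects from the cells above:

```python
import numpy as np

# PCA.transform(X) is (X - mean_) @ components_.T when whiten=False
manual = (X_train.values - pca.mean_) @ pca.components_.T
print(np.allclose(manual, HAR_pca))  # expected: True
```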
1.4 Generating column names for the PCA components
cols = ['pca_' + str(n) for n in range(pca.components_.shape[0])]
cols
['pca_0', 'pca_1']
1.5 A function to store the PCA results as a DataFrame
def get_pd_from_pca(pca_data, col_num):
    # Wrap the PCA output in a DataFrame with pca_0, pca_1, ... columns
    cols = ['pca_' + str(n) for n in range(col_num)]
    return pd.DataFrame(pca_data, columns=cols)
1.6 Two components
HAR_pca, pca = get_pca_data(X_train, n_components=2)
HAR_pd_pca = get_pd_from_pca(HAR_pca, pca.components_.shape[0])
HAR_pd_pca['action'] = y_train
HAR_pd_pca.head()
| | pca_0 | pca_1 | action |
|---|---|---|---|
| 0 | -5.520280 | -0.290278 | 5 |
| 1 | -5.535350 | -0.082530 | 5 |
| 2 | -5.474988 | 0.287387 | 5 |
| 3 | -5.677232 | 0.897031 | 5 |
| 4 | -5.748749 | 1.162952 | 5 |
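As a quick sanity check (not in the original notebook), the class balance of the six activity labels can be inspected; in the UCI HAR data the labels 1 through 6 encode the six activities:

```python
# Count how many samples each of the six activity labels (1-6) has
print(HAR_pd_pca['action'].value_counts().sort_index())
```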
1.7 Plotting the result
import seaborn as sns

sns.pairplot(HAR_pd_pca, hue='action', height=5,
             x_vars=['pca_0'], y_vars=['pca_1'])
plt.show()
- The actions do not look properly separated.
- With only two components, performance does not look promising (a rough quantitative check is sketched below).
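One way to put a number on that visual impression (my addition, not part of the original analysis) is a silhouette score on the 2-D projection; values near 0 mean the classes overlap heavily:

```python
from sklearn.metrics import silhouette_score

# Subsample to keep the pairwise-distance computation fast
score = silhouette_score(HAR_pd_pca[['pca_0', 'pca_1']],
                         HAR_pd_pca['action'],
                         sample_size=2000, random_state=13)
print(score)
```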
1.8 Reducing more than 500 features to 2
import numpy as np

def print_variance_ratio(pca):
    # How much of the total variance each component explains, and the sum
    print('variance_ratio : ', pca.explained_variance_ratio_)
    print('sum of variance_ratio : ', np.sum(pca.explained_variance_ratio_))

print_variance_ratio(pca)
variance_ratio : [0.6255444 0.04913023]
sum of variance_ratio : 0.6746746270487833
- Reducing the 500+ features down to just 2 retains about 67% of the total variance (sum of explained-variance ratios ≈ 0.67).
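Rather than trying component counts one by one, the whole trade-off can be seen at once by fitting a full PCA and plotting the cumulative explained-variance ratio; a sketch under the same setup as above:

```python
import numpy as np
import matplotlib.pyplot as plt

# Fit PCA with all components once, then plot the cumulative curve
full_pca = PCA().fit(X_train)
plt.plot(np.cumsum(full_pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()
```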
1.9 What about 3 components?
HAR_pca, pca = get_pca_data(X_train, n_components=3)
HAR_pd_pca = get_pd_from_pca(HAR_pca, pca.components_.shape[0])
HAR_pd_pca['action'] = y_train
print_variance_ratio(pca)
variance_ratio : [0.6255444 0.04913023 0.04121467]
sum of variance_ratio : 0.7158893015785988
- Reducing the 500+ features to 3 retains about 72% of the variance.
1.10 What about 10 components?
HAR_pca, pca = get_pca_data(X_train, n_components=10)
HAR_pd_pca = get_pd_from_pca(HAR_pca, pca.components_.shape[0])
HAR_pd_pca['action'] = y_train
print_variance_ratio(pca)
variance_ratio : [0.6255444 0.04913023 0.04121467 0.01874956 0.0169486 0.01272069
0.01176685 0.01068973 0.00969377 0.00858014]
sum of variance_ratio : 0.8050386453768614
- With 10 components, about 80% of the variance is explained.
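scikit-learn can also pick the component count automatically: passing a float in (0, 1) as `n_components` keeps just enough components to reach that fraction of explained variance. A short sketch, not in the original notebook:

```python
# Keep just enough components to explain 90% of the variance
pca_90 = PCA(n_components=0.9).fit(X_train)
print(pca_90.n_components_)
```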
1.11 Training a random forest
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

params = {
    'max_depth': [6, 8, 10],
    'n_estimators': [50, 100, 200],
    'min_samples_leaf': [8, 12],
    'min_samples_split': [8, 12]
}

rf_clf = RandomForestClassifier(random_state=13, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(HAR_pca, y_train.values.reshape(-1,))
GridSearchCV(cv=2, estimator=RandomForestClassifier(n_jobs=-1, random_state=13),
n_jobs=-1,
param_grid={'max_depth': [6, 8, 10], 'min_samples_leaf': [8, 12],
'min_samples_split': [8, 12],
'n_estimators': [50, 100, 200]})
1.12 Checking performance
cv_results_df = pd.DataFrame(grid_cv.cv_results_)
target_col = ['rank_test_score', 'mean_test_score', 'param_n_estimators', 'param_max_depth']
cv_results_df[target_col].sort_values('rank_test_score').head()
| | rank_test_score | mean_test_score | param_n_estimators | param_max_depth |
|---|---|---|---|---|
| 17 | 1 | 0.838547 | 200 | 8 |
| 14 | 1 | 0.838547 | 200 | 8 |
| 32 | 3 | 0.837867 | 200 | 10 |
| 35 | 3 | 0.837867 | 200 | 10 |
| 26 | 5 | 0.837595 | 200 | 10 |
- Considering that LGBM reached an accuracy of 0.93 when all 500+ features were used, the accuracy of about 0.84 here is somewhat lower.
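For a more like-for-like comparison (my addition; the original compared against a separate LGBM run), the same best parameters can be trained on the untouched 561-feature data:

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Same hyperparameters, but on the full feature set instead of the PCA space
rf_full = RandomForestClassifier(random_state=13, n_jobs=-1,
                                 **grid_cv.best_params_)
rf_full.fit(X_train, y_train.values.reshape(-1,))
print(accuracy_score(y_test, rf_full.predict(X_test)))
```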
1.13 Best parameters
grid_cv.best_params_
{'max_depth': 8,
'min_samples_leaf': 8,
'min_samples_split': 8,
'n_estimators': 200}
grid_cv.best_score_
0.8385473340587595
1.14 Applying it to the test data
from sklearn.metrics import accuracy_score

rf_clf_best = grid_cv.best_estimator_
rf_clf_best.fit(HAR_pca, y_train.values.reshape(-1,))

# Project the test set with the PCA fitted on the training data
pred1 = rf_clf_best.predict(pca.transform(X_test))
accuracy_score(y_test, pred1)
0.8530709195792331
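To see which activities the model confuses at this accuracy, a confusion matrix helps (my addition, assuming the `pred1` from the cell above):

```python
from sklearn.metrics import confusion_matrix

# Rows: true activity (1-6), columns: predicted activity
print(confusion_matrix(y_test, pred1))
```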
1.15 How long does xgboost, which previously took a long time, take now?
import time
from xgboost import XGBClassifier

# Early stopping monitors the PCA-transformed test set
evals = [(pca.transform(X_test), y_test)]

start_time = time.time()
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
xgb.fit(HAR_pca, y_train.values.reshape(-1,),
        early_stopping_rounds=10, eval_set=evals)
print('Fit time :', time.time() - start_time)
[0] validation_0-merror:0.22531
Will train until validation_0-merror hasn't improved in 10 rounds.
[1] validation_0-merror:0.22192
[2] validation_0-merror:0.20461
[3] validation_0-merror:0.20394
[4] validation_0-merror:0.20156
[5] validation_0-merror:0.20394
[6] validation_0-merror:0.19783
...
[129] validation_0-merror:0.13607
[130] validation_0-merror:0.13675
[131] validation_0-merror:0.13743
[132] validation_0-merror:0.13709
[133] validation_0-merror:0.13777
[134] validation_0-merror:0.13743
[135] validation_0-merror:0.13675
[136] validation_0-merror:0.13607
Stopping. Best iteration:
[126] validation_0-merror:0.13505
Fit time : 1.1125271320343018
- Training time has dropped considerably as well (a rough comparison is sketched below).
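To back up that claim, the same model can be timed on the full 561-feature data (my addition; it uses the same older `early_stopping_rounds` fit argument as the cell above, which newer xgboost versions moved to the constructor):

```python
import time
from xgboost import XGBClassifier

# Same model, but on the full feature set rather than the 10-D PCA space
start_time = time.time()
xgb_full = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
xgb_full.fit(X_train, y_train.values.reshape(-1,),
             early_stopping_rounds=10,
             eval_set=[(X_test, y_test)], verbose=False)
print('Fit time :', time.time() - start_time)
```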
1.16 Checking performance
accuracy_score(y_test, xgb.predict(pca.transform(X_test)))
0.8649474041398032
- The speed really did improve, but unsurprisingly the accuracy is not as high as with the full feature set.