Tiểu luận môn học máy

THÔNG TIN TÀI LIỆU

Thông tin cơ bản

Định dạng
Số trang	44
Dung lượng	2,22 MB

Nội dung

Nhập thư viện: import pandas as pd; import numpy as np; import matplotlib.pyplot as plt; from matplotlib import rcParams; import seaborn as sns; from sklearn.model_selection import train_test_split; from sklearn.metrics import accuracy_score; from sklearn.linear_model import LogisticRegression; from sklearn.neighbors import KNeighborsClassifier; from sklearn.tree import DecisionTreeClassifier; from sklearn.ensemble import RandomForestClassifier; from sklearn import metrics; from sklearn.compose import ColumnTransformer; from sklearn.preprocessing import LabelEncoder; from sklearn.impute import SimpleImputer

NGÂN HÀNG NHÀ NƯỚC VIỆT NAM BỘ GIÁO DỤC VÀ ĐÀO TẠO TRƯỜNG ĐẠI HỌC NGÂN HÀNG TP HỒ CHÍ MINH - TIỂU LUẬN CUỐI KHÓA MÔN: HỌC MÁY SVTH: LA PHÚ HÀO MSSV: 050608200057 HỌC PHẦN: DAT704_221_8_L04 GVHD: Lê Hoàng Anh Tp Hồ Chí Minh, tháng 11 năm 2022 In [1]: # Nhập thư viện import pandas as pd import numpy as np import matplotlib.pyplot as plt from matplotlib import rcParams import seaborn as sns from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score from sklearn.linear_model import LogisticRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier from sklearn import metrics from sklearn.compose import ColumnTransformer from sklearn.preprocessing import LabelEncoder from sklearn.impute import SimpleImputer Tiến hành khai báo dữ liệu. Cho biết dạng dữ liệu của các biến trong tập dữ liệu? Cho biết thông tin các biến bị thiếu dữ liệu? In [2]: # Khai báo dữ liệu data_df = pd.read_csv('/content/data_credit.csv') In [3]: # Xem giá trị các dòng đầu của dữ liệu data_df.head() Out[3]: Current Loan Amount Term Credit Score Annual Income Years in current job Home Ownership Purpose Loan ID Customer ID Loan Status 14dd88316af5-400b83ec68e61888a048 981165ec3274-42f5a3b4d104041a9ca9 Fully Paid 445412 Short Term 709.0 1167493.0 years Home Mortgage Home Improvements 4771cc26131a-45dbb5aa537ea4ba5342 2de017a32e01-49cba58108169e83be29 Fully Paid 262328 Short Term NaN NaN 10+ years Home Mortgage Deb Consolidation 4eed4e6aaa2f-4c918651ce984ee8fb26 5efb2b2b-bf114dfd-a5723761a2694725 Fully Paid 99999999 Short Term 741.0 2231892.0 years Own Home Deb Consolidation 77598f7b32e7-4e3ba6e506ba0d98fe8a e777faab98ae-45af9a867ce5b33b1011 Fully Paid 347666 Long Term 721.0 806949.0 years Own Home Deb Consolidation d4062e70befa-49958643a0de73938182 81536ad95ccf-4eb8befb47a4d608658e Fully Paid 176220 Short Term NaN NaN years Rent Deb Consolidation In [4]: # Xem giá trị các dòng cuối của dữ liệu data_df.tail() Out[4]: 
Current Loan Amount Term Credit Score Annual Income Years in current job Home Ownership Loan ID Customer ID Loan Status 99995 3f94c18c-ba8f45d0-861088a684a410a9 2da51983-cfef4b8f-a7335dfaf69e9281 Fully Paid 147070 Short Term 725.0 475437.0 years Own Home 99996 06eba04f-58fc424a-b666ed72aa008900 77f2252ab7d1-4b07a7461202a8304290 Fully Paid 99999999 Short Term 732.0 1289416.0 year Rent 99997 e1cb4050-eff54bdb-a1b0aabd3f7eaac7 2ced5f10bd60-4a119134cadce4e7b0a3 Fully Paid 103136 Short Term 742.0 1150545.0 years Rent 99998 81ab928bd1a5-45239a3c271ebb01b4fb 3e45ffda-99fd4cfc-b8b8446f4a505f36 Fully Paid 530332 Short Term 746.0 1717524.0 years Rent 99999 c63916c66d46-47a9949a51d09af4414f 1b3014be5c07-4d41abe744573c375886 Fully Paid 99999999 Short Term 743.0 935180.0 NaN Own Home In [5]: # Xem thông tin index, kiểu liệu dung lượng liệu data_df.info() RangeIndex: 100000 entries, to 99999 Data columns (total 19 columns): # Column Non-Null Count Dtype - - - Loan ID 100000 non-null object Customer ID 100000 non-null object Loan Status 100000 non-null object Current Loan Amount 100000 non-null int64 Term 100000 non-null object Credit Score 80846 non-null float64 Annual Income 80846 non-null float64 Years in current job 95778 non-null object Home Ownership 100000 non-null object Purpose 100000 non-null object 10 Monthly Debt 100000 non-null float64 11 Years of Credit History 100000 non-null float64 12 Months since last delinquent 46859 non-null float64 13 Number of Open Accounts 100000 non-null int64 14 Number of Credit Problems 100000 non-null int64 15 Current Credit Balance 100000 non-null int64 16 Maximum Open Credit 99998 non-null float64 17 Bankruptcies 99796 non-null float64 18 Tax Liens 99990 non-null float64 dtypes: float64(8), int64(4), object(7) memory usage: 14.5+ MB Pur Consolid Consolid Consolid Consolid In [6]: # Xác định kiểu liệu biến liệu data_df.dtypes Out[6]: Loan ID object Customer ID object Loan Status object Current Loan Amount int64 Term object Credit Score 
float64 Annual Income float64 Years in current job object Home Ownership object Purpose object Monthly Debt float64 Years of Credit History float64 Months since last delinquent float64 Number of Open Accounts int64 Number of Credit Problems int64 Current Credit Balance int64 Maximum Open Credit float64 Bankruptcies float64 Tax Liens float64 dtype: object In [7]: # Xem mức độ phân phối biến liệu data_df.hist(figsize=(40, 40)) Out[7]: array([[, , ], [, , ], [, , ], [, , ]], dtype=object) In [8]: # Kiểm tra liệu có bị null hay khơng data_df.isnull().sum() Out[8]: Loan ID Customer ID Loan Status Current Loan Amount Term Credit Score 19154 Annual Income 19154 Years in current job 4222 Home Ownership Purpose Monthly Debt Years of Credit History Months since last delinquent 53141 Number of Open Accounts Number of Credit Problems Current Credit Balance Maximum Open Credit Bankruptcies 204 Tax Liens 10 dtype: int64 In [9]: # Thống kê % liệu bị thiếu biến liệu for col in data_df.columns: missing_data = data_df[col].isna().sum() missing_percent = missing_data/len(data_df)*100 print(f'Columns {col}: has {missing_percent}% missing data') Columns Columns Columns Columns Columns Columns Columns Columns Columns Columns Columns Columns Columns Columns Columns Columns Columns Columns Columns Loan ID: has 0.0% missing data Customer ID: has 0.0% missing data Loan Status: has 0.0% missing data Current Loan Amount: has 0.0% missing data Term: has 0.0% missing data Credit Score: has 19.154% missing data Annual Income: has 19.154% missing data Years in current job: has 4.222% missing data Home Ownership: has 0.0% missing data Purpose: has 0.0% missing data Monthly Debt: has 0.0% missing data Years of Credit History: has 0.0% missing data Months since last delinquent: has 53.141000000000005% missing data Number of Open Accounts: has 0.0% missing data Number of Credit Problems: has 0.0% missing data Current Credit Balance: has 0.0% missing data Maximum Open Credit: has 0.002% missing data 
Bankruptcies: has 0.20400000000000001% missing data Tax Liens: has 0.01% missing data In [10]: # Trực quan hóa bị thiếu tập liệu fig, ax = plt.subplots(figsize=(80,70)) sns.heatmap(data_df.isna(), cmap='Blues', cbar=False, yticklabels=False) Out[10]: In [11]: # Khởi tạo model Encoder le = LabelEncoder() In [12]: # Mã hóa liệu data_df['Loan Status'] = le.fit_transform(data_df['Loan Status']) data_df['Term'] = le.fit_transform(data_df['Term']) data_df['Years in current job'] = le.fit_transform(data_df['Years in current job']) data_df['Home Ownership'] = le.fit_transform(data_df['Home Ownership']) data_df['Purpose'] = le.fit_transform(data_df['Purpose']) In [13]: # Xử lí liệu bị thiếu theo phương pháp lấy giá trị trước cho biến 'Years in current job' data_df['Years in current job'].fillna(method='pad') Out[13]: 8 99995 99996 99997 99998 99999 11 Name: Years in current job, Length: 100000, dtype: int64 In [14]: # Kiểm tra liệu biến 'Years in current job' encoder có bị null hay khơng data_df['Years in current job'].isnull().sum() Out[14]: In [15]: # Tính giá trị trung bình cho bị thiếu liệu biến 'Credit Score' 'Annual Income', 'Months since last delin imputer = SimpleImputer(missing_values = np.nan, strategy='mean') imputer.fit(data_df.iloc[:, [5, 6, 12, 16, 17, 18]]) data_df.iloc[:, [5, 6, 12, 16, 17, 18]] = imputer.fit_transform(data_df.iloc[:, [5, 6, 12, 16, 17, 18]]) In [16]: # Xem giá trị biến 'Credit Score' 'Annual Income', 'Months since last delinquent', 'Maximum Open Credi data_df.iloc[:, [5, 6, 12, 16, 17, 18]] Out[16]: Credit Score Annual Income Months since last delinquent Maximum Open Credit Bankruptcies Tax Liens 709.000000 1.167493e+06 34.901321 416746.0 1.0 0.0 1076.456089 1.378277e+06 8.000000 850784.0 0.0 0.0 741.000000 2.231892e+06 29.000000 750090.0 0.0 0.0 721.000000 8.069490e+05 34.901321 386958.0 0.0 0.0 1076.456089 1.378277e+06 34.901321 427174.0 0.0 0.0 99995 725.000000 4.754370e+05 34.901321 658548.0 0.0 0.0 99996 732.000000 1.289416e+06 
21.000000 509234.0 0.0 0.0 99997 742.000000 1.150545e+06 18.000000 537548.0 1.0 0.0 99998 746.000000 1.717524e+06 34.901321 738254.0 0.0 0.0 99999 743.000000 9.351800e+05 34.901321 91014.0 1.0 0.0 100000 rows × columns In [17]: # Kiểm tra lại xem biến 'Credit Score' 'Annual Income', 'Months since last delinquent', 'Maximum Open Cred data_df.iloc[:, [5, 6, 12, 16, 17, 18]].isnull().sum() Out[17]: Credit Score Annual Income Months since last delinquent Maximum Open Credit Bankruptcies Tax Liens dtype: int64 In [87]: # Huấn luyện mơ hình KNNmodel2.fit(X2_train, y2_train) Out[87]: KNeighborsClassifier(n_neighbors=31) In [88]: # Dự đoán giá trị tập train y2hatKNN_train = knn2.predict(X2_train) In [89]: # Kiểm tra độ xác Accuracy print('Accuracy in train dataset:', accuracy_score(y2_train, y2hatKNN_train)) print('Accuracy in test dataset:', accuracy_score(y2_test, y2hatKNN_test)) Accuracy in train dataset: 0.7739 Accuracy in test dataset: 0.7729333333333334 In [90]: # Ma trận nhầm lẫn confusion_matrix = pd.crosstab(y2_test, y2hatKNN_test, rownames=['Actual'], colnames=['Predicted']) sns.heatmap(confusion_matrix, annot=True, cmap='Blues') print('Accuracy: ', metrics.accuracy_score(y2_test, y2hatKNN_test)) plt.show() Accuracy: 0.7729333333333334 Decision Tree In [91]: # Xây dựng mơ hình DTmodel2 = DecisionTreeClassifier() In [92]: # Huấn luyện mơ hình DTmodel2.fit(X2_train, y2_train) Out[92]: DecisionTreeClassifier() In [93]: # Dự đoán giá trị tập train tập test y2hatDT_train = DTmodel2.predict(X2_train) y2hatDT_test = DTmodel2.predict(X2_test) In [94]: # Kiểm tra độ xác Accuracy print('Accuracy in train dataset:', accuracy_score(y2_train, y2hatDT_train)) print('Accuracy in test dataset:', accuracy_score(y2_test, y2hatDT_test)) Accuracy in train dataset: 0.9910428571428571 Accuracy in test dataset: 0.7497333333333334 In [95]: # Ma trận nhầm lẫn confusion_matrix = pd.crosstab(y2_test, y2hatDT_test, rownames=['Actual'], colnames=['Predicted']) sns.heatmap(confusion_matrix, 
annot=True, cmap ='Blues') print('Accuracy: ', metrics.accuracy_score(y2_test, y2hatDT_test)) plt.show() Accuracy: 0.7497333333333334 Random Forest In [96]: # Khởi tạo mơ hình RFmodel2 = RandomForestClassifier(n_estimators = 10) In [97]: # Huấn luyện mơ hình RFmodel2.fit(X2_train, y2_train) Out[97]: RandomForestClassifier(n_estimators=10) In [98]: # Dự đoán giá trị tập train tập test y2hatRF_train = RFmodel2.predict(X2_train) y2hatRF_test = RFmodel2.predict(X2_test) In [99]: # Kiểm tra độ xác Accuracy print('Accuracy in train dataset:', accuracy_score(y2_train, y2hatRF_train)) print('Accuracy in test dataset:', accuracy_score(y2_test, y2hatRF_test)) Accuracy in train dataset: 0.9782285714285714 Accuracy in test dataset: 0.7768333333333334 In [100]: lst2_n = [] # Danh sách giá trị n lst4_accuracy = [] # Danh sách độ xác tương ứng với giá trị n for n in range(10, 51): # Cho phạm vi chạy khoảng từ 10 đến 50 lst2_n.append(n) rf2 = RandomForestClassifier(n_estimators = n) rf2.fit(X2_train, y2_train) y2hatRF_test = rf2.predict(X2_test) accuracy4 = accuracy_score(y2_test, y2hatRF_test) * 100 lst4_accuracy.append(accuracy4) print('n =', n,'- Accuracy:', accuracy4,'%') n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 - Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: 77.61333333333333 % 78.68333333333334 % 78.03999999999999 % 78.66333333333333 % 78.32666666666667 % 78.79 % 
78.58000000000001 % 78.93666666666667 % 78.7 % 78.94666666666666 % 78.72333333333333 % 79.04333333333334 % 78.86999999999999 % 79.19666666666667 % 78.81 % 79.22 % 78.90666666666667 % 79.14 % 78.97 % 79.25333333333333 % 78.90333333333334 % 79.22666666666667 % 78.85333333333332 % 79.22666666666667 % 78.95666666666666 % 78.99666666666667 % 79.16333333333333 % 79.21333333333334 % 79.05333333333333 % 79.22666666666667 % 79.08 % 79.25666666666666 % 78.94666666666666 % 79.27333333333333 % 79.16 % 79.27666666666666 % 79.10333333333334 % 79.27 % 79.08 % 79.28666666666668 % 79.14333333333333 % In [101]: vitri4 = lst4_accuracy.index(max(lst4_accuracy)) n2_best = lst2_n[vitri4] print('Giá trị n phù hợp là', n2_best, 'với accuracy là', lst4_accuracy[vitri4]) Giá trị n phù hợp 49 với accuracy 79.28666666666668 In [102]: # Xây dựng mô hình RFmodel2new = RandomForestClassifier(n_estimators = 49) In [103]: # Huấn luyện mơ hình RFmodel2new.fit(X2_train, y2_train) Out[103]: RandomForestClassifier(n_estimators=49) In [104]: # Dự đoán giá trị tập train tập test từ model y2hatRFnew_train = RFmodel2new.predict(X2_train) y2hatRFnew_test = RFmodel2new.predict(X2_test) In [105]: # Ma trận nhầm lẫn confusion_matrix = pd.crosstab(y2_test, y2hatRFnew_test, rownames=['Actual'], colnames=['Predicted']) sns.heatmap(confusion_matrix, annot = True, cmap = 'Blues') print('Accuracy: ', metrics.accuracy_score(y2_test, y2hatRFnew_test)) plt.show() Accuracy: 0.7927666666666666 Chia tập huấn luyện tập kiểm định theo tỷ lệ 60:40 In [106]: # Chia tập liệu X3_train, X3_test, y3_train, y3_test = train_test_split(X, y, test_size=0.4, random_state=42) Logistic Regression In [107]: # Xây dựng mơ hình LRmodel3 = LogisticRegression() In [108]: # Huấn luyện mơ hình LRmodel3.fit(X3_train, y3_train) Out[108]: LogisticRegression() In [109]: # Dự đoán giá trị tập train tập test y3hatLR_train = LRmodel3.predict(X3_train) y3hatLR_test = LRmodel3.predict(X3_test) In [110]: # Kiểm tra độ xác Accuracy print('Accuracy in 
train dataset:', accuracy_score(y3_train, y3hatLR_train)) print('Accuracy in test dataset:', accuracy_score(y3_test, y3hatLR_test)) Accuracy in train dataset: 0.8153333333333334 Accuracy in test dataset: 0.814075 In [111]: # Ma trận nhầm lẫn confusion_matrix = pd.crosstab(y3_test, y3hatLR_test, rownames = ['Actual'], colnames = ['Predicted']) sns.heatmap(confusion_matrix, annot=True, cmap='Blues') print('Accuracy: ', metrics.accuracy_score(y3_test, y3hatLR_test)) plt.show() Accuracy: 0.814075 KNN In [112]: # Tính toán giá trị k phù hợp np.sqrt(X3_train.shape[0])/2 Out[112]: 122.47448713915891 In [113]: # k = 122 nhập vào range giá trị k = 123 In [114]: lst3_k = [] # Danh sách giá trị k lst5_accuracy = [] # Danh sách độ xác tương ứng với giá trị k for k in range(2, 123): lst3_k.append(k) knn3 = KNeighborsClassifier(n_neighbors = k) knn3.fit(X3_train, y3_train) y3hatKNN_test = knn3.predict(X3_test) accuracy5 = accuracy_score(y_true = y3_test, y_pred = y3hatKNN_test) * 100 lst5_accuracy.append(accuracy5) print('k =', k,'- Accuracy:', accuracy5,'%') k k k k k k k k k k k k k k k k k k k k = = = = = = = = = = = = = = = = = = = - Accuracy: 73.91250000000001 % - Accuracy: 70.1325 % - Accuracy: 75.015 % - Accuracy: 72.9175 % - Accuracy: 75.7375 % - Accuracy: 74.465 % - Accuracy: 76.14 % 10 - Accuracy: 75.31 % 11 - Accuracy: 76.3925 % 12 - Accuracy: 75.86500000000001 % 13 - Accuracy: 76.5975 % 14 - Accuracy: 76.2475 % 15 - Accuracy: 76.8025 % 16 - Accuracy: 76.505 % 17 - Accuracy: 76.89 % 18 - Accuracy: 76.75999999999999 % 19 - Accuracy: 76.9425 % 20 - Accuracy: 76.825 % 21 - Accuracy: 77.0275 % 22 Accuracy: 76 94 % In [115]: vitri5 = lst5_accuracy.index(max(lst5_accuracy)) k3_best = lst3_k[vitri5] print('Giá trị k phù hợp là', k3_best, 'với accuracy là', lst5_accuracy[vitri5]) Giá trị k phù hợp 69 với accuracy 77.185 In [116]: # Khởi tạo mơ hình KNNmodel3 = KNeighborsClassifier(n_neighbors = 69) In [117]: # Huấn luyện mơ hình KNNmodel3.fit(X3_train, y3_train) Out[117]: 
KNeighborsClassifier(n_neighbors=69) In [118]: # Dự đoán giá trị tập train y3hatKNN_train = knn3.predict(X3_train) In [119]: # Kiểm tra độ xác Accuracy print('Accuracy in train dataset:', accuracy_score(y3_train, y3hatKNN_train)) print('Accuracy in test dataset:', accuracy_score(y3_test, y3hatKNN_test)) Accuracy in train dataset: 0.7749 Accuracy in test dataset: 0.77175 In [120]: # Ma trận nhầm lẫn confusion_matrix = pd.crosstab(y3_test, y3hatKNN_test, rownames=['Actual'], colnames=['Predicted']) sns.heatmap(confusion_matrix, annot=True, cmap='Blues') print('Accuracy: ', metrics.accuracy_score(y3_test, y3hatKNN_test)) plt.show() Accuracy: 0.77175 Decision Tree In [121]: # Xây dựng mơ hình DTmodel3 = DecisionTreeClassifier() In [122]: # Huấn luyện mơ hình DTmodel3.fit(X3_train, y3_train) Out[122]: DecisionTreeClassifier() In [123]: # Dự đoán giá trị tập train tập test y3hatDT_train = DTmodel3.predict(X3_train) y3hatDT_test = DTmodel3.predict(X3_test) In [124]: # Kiểm tra độ xác Accuracy print('Accuracy in train dataset:', accuracy_score(y3_train, y3hatDT_train)) print('Accuracy in test dataset:', accuracy_score(y3_test, y3hatDT_test)) Accuracy in train dataset: 0.99195 Accuracy in test dataset: 0.7467 In [125]: # Ma trận nhầm lẫn confusion_matrix = pd.crosstab(y3_test, y3hatDT_test, rownames=['Actual'], colnames=['Predicted']) sns.heatmap(confusion_matrix, annot=True, cmap ='Blues') print('Accuracy: ', metrics.accuracy_score(y3_test, y3hatDT_test)) plt.show() Accuracy: 0.7467 Random Forest In [126]: # Khởi tạo mơ hình RFmodel3 = RandomForestClassifier(n_estimators = 10) In [127]: # Huấn luyện mơ hình RFmodel3.fit(X3_train, y3_train) Out[127]: RandomForestClassifier(n_estimators=10) In [128]: # Dự đoán giá trị tập train tập test y3hatRF_train = RFmodel3.predict(X3_train) y3hatRF_test = RFmodel3.predict(X3_test) In [ ]: # Kiểm tra độ xác Accuracy print('Accuracy in train dataset:', accuracy_score(y3_train, y3hatRF_train)) print('Accuracy in test dataset:', 
accuracy_score(y3_test, y3hatRF_test)) In [130]: lst3_n = [] # Danh sách giá trị n lst6_accuracy = [] # Danh sách độ xác tương ứng với giá trị n for n in range(10, 51): # Cho phạm vi chạy khoảng từ 10 đến 50 lst3_n.append(n) rf3 = RandomForestClassifier(n_estimators = n) rf3.fit(X3_train, y3_train) y3hatRF_test = rf3.predict(X3_test) accuracy6 = accuracy_score(y3_test, y3hatRF_test) * 100 lst6_accuracy.append(accuracy6) print('n =', n,'- Accuracy:', accuracy6,'%') n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 - Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: Accuracy: 77.78 % 78.5875 % 78.105 % 78.78500000000001 % 78.38000000000001 % 78.845 % 78.48 % 78.96 % 78.7325 % 79.0475 % 78.6425 % 79.0 % 78.6525 % 79.06 % 78.88 % 79.05 % 78.69500000000001 % 78.9825 % 79.0175 % 79.185 % 79.03750000000001 % 79.1125 % 79.0075 % 79.295 % 79.0575 % 79.1225 % 78.96 % 79.23750000000001 % 79.0775 % 79.17999999999999 % 79.23249999999999 % 79.2775 % 79.205 % 79.2925 % 79.16499999999999 % 79.24 % 79.1125 % 79.2775 % 79.1925 % 79.28500000000001 % 79.1375 % In [131]: vitri6 = lst6_accuracy.index(max(lst6_accuracy)) n3_best = lst3_n[vitri6] print('Giá trị n phù hợp là', n3_best, 'với accuracy là', lst6_accuracy[vitri6]) Giá trị n phù hợp 33 với accuracy 79.295 In [132]: # Xây dựng mơ hình RFmodel3new = RandomForestClassifier(n_estimators = 33) In [133]: # Huấn luyện mơ hình RFmodel3new.fit(X3_train, 
y3_train) Out[133]: RandomForestClassifier(n_estimators=33) In [134]: # Dự đoán giá trị tập train và tập test từ model y3hatRFnew_train = RFmodel3new.predict(X3_train) y3hatRFnew_test = RFmodel3new.predict(X3_test) In [135]: # Ma trận nhầm lẫn confusion_matrix = pd.crosstab(y3_test, y3hatRFnew_test, rownames=['Actual'], colnames=['Predicted']) sns.heatmap(confusion_matrix, annot = True, cmap = 'Blues') print('Accuracy: ', metrics.accuracy_score(y3_test, y3hatRFnew_test)) plt.show() Accuracy: 0.7929 Lựa chọn mô hình In [136]: # Tạo danh sách Accuracy Score của các mô hình lst_accuracy_score = [ accuracy_score(y1_test, y1hatLR_test), accuracy_score(y1_test, y1hatKNN_test), accuracy_score(y1_test, y1hatDT_test), accuracy_score(y1_test, y1hatRF_test), accuracy_score(y2_test, y2hatLR_test), accuracy_score(y2_test, y2hatKNN_test), accuracy_score(y2_test, y2hatDT_test), accuracy_score(y2_test, y2hatRF_test), accuracy_score(y3_test, y3hatLR_test), accuracy_score(y3_test, y3hatKNN_test), accuracy_score(y3_test, y3hatDT_test), accuracy_score(y3_test, y3hatRF_test)] In [137]: # Sắp xếp Accuracy Score từ cao xuống thấp lst_accuracy_score = sorted(lst_accuracy_score, reverse = True) lst_accuracy_score Out[137]: [0.81525, 0.8149666666666666, 0.814075, 0.7914333333333333, 0.791375, 0.79105, 0.77345, 0.7729333333333334, 0.77175, 0.75245, 0.7497333333333334, 0.7467] In [138]: lst_accuracy_score = pd.DataFrame(data = lst_accuracy_score, columns = ['Accuracy Score']) lst_accuracy_score Out[138]: Accuracy Score 0.815250 0.814967 0.814075 0.791433 0.791375 0.791050 0.773450 0.772933 0.771750 0.752450 10 0.749733 11 0.746700 KẾT LUẬN: Qua đánh giá mức độ chính xác (Accuracy Score) từ các mô hình trên, kết quả cho thấy mô hình Logistic Regression chia theo tỷ lệ 80:20 có độ chính xác cao nhất là 81.52%. Như vậy, chọn mô hình Logistic Regression với tỷ lệ 80:20.

Ngày đăng: 24/06/2023, 10:59