Developer's Development

3.2.17 [머신러닝] etc 본문

데이터 분석과 머신러닝, 딥러닝/머신러닝

3.2.17 [머신러닝] etc

mylee 2025. 8. 3. 21:02
머신러닝 모델 저장 (joblib)

 

pip install joblib
# Fit a throwaway model so we have something to persist (joblib demo fixture).
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load

df = pd.read_csv('./data/wine_simple.csv')
features = df.drop('class', axis=1)
target = df['class']

X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42)

clf = RandomForestClassifier(n_estimators=1000)
clf.fit(X_train, y_train)

dump(clf, 'rf_model.joblib')            # persist the fitted model to disk
restored = load('rf_model.joblib')      # restore it from disk

preds_original = clf.predict(X_test)
preds_restored = restored.predict(X_test)

# The restored model reproduces the original's predictions exactly.
np.array_equal(preds_original, preds_restored)  # True

 

 

 

가상 데이터 생성 (Faker)

 

https://pypi.org/project/Faker/

 

Faker

Faker is a Python package that generates fake data for you.

pypi.org

pip install faker
# Quick tour of Faker: every call yields a freshly generated fake value.
from faker import Faker

gen = Faker()
print(gen.name())                               # full name
print(gen.first_name(), ",", gen.last_name())   # name parts
print(gen.email())                              # e-mail address
print(gen.company())                            # company name
print(gen.date_of_birth())                      # birth date
# Build a fake-customer dataset and return it as a DataFrame.
import pandas as pd
import random

fake = Faker()

def generate_customer_data(n=1000):
    """Generate *n* fake customer records as a pandas DataFrame."""
    rows = []

    for _ in range(n):
        rows.append([
            fake.name(),                                          # customer name
            fake.address().replace(',', ' ').replace('\n', '_'),  # address flattened to one line
            fake.random_int(min=19, max=60),                      # age: 19-60
            fake.random_int(min=1, max=120),                      # subscription months (1 month - 10 years)
            round(random.uniform(10, 100), 2),                    # monthly payment, $10-$100, 2 decimals
            fake.random_int(min=1, max=30),                       # monthly usage frequency (1-30)
            fake.random_int(min=1, max=5),                        # satisfaction score (1-5)
            fake.boolean(chance_of_getting_true=30),              # churn flag (~30% True)
        ])
        # Numeric fields could equally come from the random module, e.g.:
        #   age = random.randint(19, 60); sub_months = random.randint(1, 120)
        #   monthly_payment = round(random.uniform(10, 100), 2)
        #   usage_frequency = random.randint(1, 30)
        #   stft_score = random.randint(1, 5); churn = random.random() < 0.3

    columns = ["NAME", "ADDRESS", "AGE", "SUB_MONTHS", "MONTHLY_PAYMENT", "USAGE_FEQ", "STFT", "CHURN"]

    return pd.DataFrame(rows, columns=columns)

customer_df = generate_customer_data(20000)
customer_df.to_csv('customer_data.csv', index=False)

 

 

 

회귀 기반 추천 시스템

 

CatBoost (Categorical Boosting)

pip install catboost
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import Pool
from catboost import CatBoostRegressor

travel_df = pd.read_csv('./data/travel.csv')  # shape: (34572, 15)

# Columns stored as float that must be integer-typed for CatBoost categories.
int_cols = [
    'AGE_GRP',
    'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3', 'TRAVEL_STYL_4',
    'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7', 'TRAVEL_STYL_8',
    'TRAVEL_MOTIVE_1', 'TRAVEL_COMPANIONS_NUM',
]
travel_df[int_cols] = travel_df[int_cols].astype(int)

X = travel_df.drop("DGSTFN", axis=1)
y = travel_df["DGSTFN"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Categorical features: the integer columns above plus the string-valued ones.
cat_features = ['GENDER'] + int_cols + ['VISIT_AREA_NM', 'MVMN_NM']

# Pool objects bundle data + categorical-feature metadata for CatBoost.
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

cb_reg = CatBoostRegressor(
    n_estimators=500,       # number of boosting iterations (trees)
    depth=5,                # max depth of each individual tree
    learning_rate=0.03,
    loss_function='RMSE',   # default loss
    eval_metric='RMSE'      # default metric; lets fit() train and validate together
)

# verbose=100: report train/validation metrics once every 100 iterations.
cb_reg.fit(train_pool, eval_set=test_pool, verbose=100)

# Feature importances paired with their column names.
col_importance = pd.DataFrame({
    'column': X_train.columns,
    'importance': cb_reg.feature_importances_
})

col_importance

 

  • 추천시스템 구축

1. 방문지 목록을 생성

2. 사용자 특성 입력

3. 가상 만족도 예측

4. 만족도가 높은 순으로 추천

# 1. Candidate destinations: every distinct visit area in the data.
visit_areas = travel_df['VISIT_AREA_NM'].unique()
# visit_areas[:10]

# 2. Fixed user profile; slot [-2] (VISIT_AREA_NM) is filled per candidate.
user_input = ['여', 60, 4, 4, 4, 4, 4, 4, 4, 4, 1, 2, '방문지', '자가용']

# 3. Predict hypothetical satisfaction for every destination.
#    Fix: build all candidate rows first and call predict() ONCE, instead of
#    one model invocation per destination; copying the row also avoids
#    mutating user_input in place inside the loop.
candidate_rows = []
for area in visit_areas:
    row = list(user_input)
    row[-2] = area
    candidate_rows.append(row)

pred_results = cb_reg.predict(candidate_rows)
pred_results[:10]


# 4. Recommend: destinations ranked by predicted satisfaction, best first.
result_df = pd.DataFrame({
    'VISIT_AREA_NM': visit_areas,
    'DGSTFN_PRED': pred_results
})

result_df.sort_values(by='DGSTFN_PRED', ascending=False).head(10)

 

 

 

SMOTE

 

pip install imbalanced-learn
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

X, y = make_classification(
    n_classes=2,
    weights=[0.9, 0.1],
    n_samples=1000,
    random_state=42
)

print(f'Before SMOTE: {np.bincount(y)}')  # Before SMOTE: [897 103] /샘플링 편향

smote = SMOTE(random_state=42)
X_resample, y_resample = smote.fit_resample(X, y)

print(f'After SMOTE: {np.bincount(y_resample)}')  # After SMOTE: [897 897]
print(X_resample.shape)  # (1794, 20)
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

"""
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       224
           1       1.00      0.69      0.82        26

    accuracy                           0.97       250
   macro avg       0.98      0.85      0.90       250
weighted avg       0.97      0.97      0.97       250
"""

# 오버샘플링 적용 데이터
X_train, X_test, y_train, y_test = train_test_split(X_resample, y_resample, random_state=0)

model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

"""
# 전번적으로 성능이 향상됨
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       224
           1       0.96      0.96      0.96       225

    accuracy                           0.96       449
   macro avg       0.96      0.96      0.96       449
weighted avg       0.96      0.96      0.96       449
"""