project-LinearRegression
Linear Regression project
- 한달동안 팀원들과 함께 한 프로젝트
- 홍성현님, 유호원님, 배준영님 한달동안 수고 많았습니다 :)
- 깃헙에서도 확인할 수 있습니다.
선형회귀분석에 사용한 코드 정리
패키지 모듈
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import random
import re
import time

import matplotlib as mpl
import matplotlib.pylab as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# Lift the limit on how many columns are shown when a DataFrame is displayed.
# ``None`` is the documented "no limit" value; ``-1`` is not accepted by
# recent pandas versions.
pd.set_option('display.max_columns', None)
# 지수 표기법 해제
pd.options.display.float_format = '{:.7f}'.format로우데이터 임포트
1
raw_data = pd.read_csv("./csv_file/vehicles.csv") # ./ : 현재경로
결측치 확인 및 제거
1
2
3import missingno as msno
msno.matrix(start_df)
plt.show()결측치 수치화
1
2
3
4
5
def columns_na_percentage(df, columns):
    """Print the share of missing values for every column that has any.

    For each label in ``columns`` the percentage of missing entries in
    ``df`` is computed, rounded to two decimals and printed as
    ``"<column> :  <percentage> %"``. Columns whose rounded percentage is
    zero are not printed.

    Args:
        df: DataFrame to inspect.
        columns: Iterable of column labels of ``df``.

    Returns:
        None. The report is written to standard output.
    """
    total_rows = len(df)
    if total_rows == 0:
        # Nothing can be missing in an empty frame; also avoids dividing by 0.
        return
    for column in columns:
        # Index with the label itself: formatting it into a string first
        # fails for non-string labels such as integers.
        present = len(df[column].dropna())
        percentage = round(100 - (present / total_rows) * 100, 2)
        if percentage:
            print("{} : ".format(column), percentage, "%", end='\n')
columns_na_percentage(start_df, start_df.columns)결측치 제거
1
2start_df = start_df.dropna(axis=0)
print("결측치 제거후 Data : ", start_df.shape)price 컬럼 내용 확인
1
raw_data['price'].describe()
1
raw_data['price'].sort_values(ascending=False)[:10]
중복 데이터 제거
1
2start_df = start_df.loc[start_df['vin'].drop_duplicates(keep='last').index]
print("중복 vin 삭제 Data : ", start_df.shape)VIN(차대번호) 크롤링 후 데이터 합치기
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22vin_crawling_data = pd.read_csv('./csv_file/final_vin_crawling.csv')
# secend_half_crawling_data = pd.read_csv('./vin_crawling_addtional.csv')
# vin_crawling_data = pd.concat([first_half_crawling_data, secend_half_crawling_data], axis=0)
# 대문자 변경
vin_crawling_data['vin'] = vin_crawling_data['vin'].str.upper()
# 중복제거
vin_crawling_data = vin_crawling_data.drop_duplicates('vin', keep='first')
# 대문자 변경
start_df['vin'] = start_df['vin'].str.upper()
# merge
merged_df = pd.merge(start_df, vin_crawling_data, on='vin')
# 중복 제거
merged_df = merged_df.drop_duplicates('vin', keep='first')
# 필요 없는 컬럼 제거
df = merged_df[merged_df.columns.difference(['id_y', 'id_x', 'Unnamed: 0', 'Unnamed: 0.1', 'og_vin'])]
print("크롤링 Data와 중복 vin 제거 data가 합쳐진 Data : ", df.shape)에러 컬럼 삭제
1
df['error'].value_counts()
1
2
3
4df=df[df.columns.difference(['error'])]
df = df.dropna(axis=0)
msno.matrix(df)
plt.show()데이터 필터링
1
2
3
4
5
6
def get_sigma_data_by_price(num):
    """Return the rows of the global ``df`` priced within ``num`` sigma.

    A row is kept when its ``price`` lies in the closed interval
    ``[average - num * stdev, average + num * stdev]``, where ``average``
    and ``stdev`` are read from that same row.

    NOTE: the original comment said rows with a price of 0 are removed.
    The filter has no explicit zero check; such rows are dropped only when
    0 falls outside the band.

    Args:
        num: Width of the accepted band, in multiples of ``stdev``.

    Returns:
        The filtered selection of ``df``.
    """
    spread = df['stdev'] * num
    lower_bound = df['average'] - spread
    upper_bound = df['average'] + spread
    within_band = (df['price'] >= lower_bound) & (df['price'] <= upper_bound)
    return df[within_band]
sigma_2_df = get_sigma_data_by_price(2)
print("2 sigma Data:", sigma_2_df.shape)year 항목 int로 형변환
1
sigma_2_df['year'] = sigma_2_df['year'].astype('int')
주행거리 아웃라이어 탐색
1
2print("주행거리가 잘못 입력된 차량 대수 : ",len(sigma_2_df[sigma_2_df['mileage'] != sigma_2_df['odometer']]))
sigma_2_df[sigma_2_df['mileage'] != sigma_2_df['odometer']].sort_values(ascending=False,by='odometer').head(2)1
2# 주행거리 아웃라이어 제거
sigma_2_df = sigma_2_df[sigma_2_df['mileage'] == sigma_2_df['odometer']]시각화: 박스플롯, 스캐터 플롯
1
2
3
4
5
6
7
8
9
10plt.figure(figsize=(16,15))
plt.subplot(311)
sns.boxplot(x="year", y="price", data = sigma_2_df)
plt.xticks(rotation=90)
plt.title('연도별 가격 분포')
plt.subplot(312)
sns.scatterplot(sigma_2_df['year'],sigma_2_df['odometer'])
plt.title("연도별 주행거리 분포")
plt.tight_layout()
plt.show()눈으로 확인 후 수정이 필요한 데이터
1
2# year 수정해 줘야 하는 데이터
sigma_2_df[sigma_2_df['year'] < 1980]1
sigma_2_df['year'] = sigma_2_df['year'].apply(lambda x : 2019 if x < 1980 else x)
주행거리가 높은 차량 확인
1
sigma_2_df[sigma_2_df['odometer'] > 500000]
1
2# 일 평균 주행거리 계산하여 확인
print("평균 주행거리 : ", round(533000.00 / (365 * 18), 2), "마일/day")높은 가격의 차량 확인
1
sigma_2_df[sigma_2_df['price'] > 75000].sort_values(by=['price'],ascending=False).head()
1
2# 낮은 금액의 차량 확인
sigma_2_df[sigma_2_df['price'] < 500].sort_values(by=['price'],ascending=False)1
2# 이상치 가격을 보이는 차량 제거
sigma_2_df = sigma_2_df[(sigma_2_df['vin'] != '1GCWGFBA7C1155304') & (sigma_2_df['vin'] != '1GCWGFBA8C1126880') & (sigma_2_df['vin'] != '1FMCU03115KA47874')]주행거리 0인 데이터 확인 및 제거
1
2
3print("판매자가 주행거리를 0으로 올려둔 경우 : ",len(sigma_2_df[sigma_2_df['odometer'] == 0]),"건")
sigma_2_df[sigma_2_df['odometer'] == 0].head(2)
sigma_2_df= sigma_2_df[sigma_2_df['odometer'] != 0]분석에 필요한 컬럼
1
2
3
4df = sigma_2_df[['price','year','odometer','drive','fuel','manufacturer','state','title_status','transmission','type','paint_color','cylinders']]
df = df.reset_index(drop=True)
msno.matrix(df)
plt.show()종속변수 : price
1
df.price.sort_values(ascending=False)[:20]
1
2
3
4
5
6
7
8
9plt.figure(figsize=(17,6))
plt.subplot(131)
sns.distplot(np.log(df['price']))
plt.subplot(132)
sns.boxplot(np.log(df['price']))
plt.subplot(133)
stats.probplot(np.log(df['price']), plot=plt)
plt.tight_layout()
plt.show()독립변수 확인
1
2
3
4
5plt.figure(figsize=(16,8))
sns.countplot(df['year'])
plt.title("차량 제조연도")
plt.xticks(rotation=90)
plt.show()주행거리 컬럼 확인
1
2
3
4
5
6
7
8plt.figure(figsize=(16,7))
plt.subplot(121)
sns.distplot(df['odometer'])
plt.subplot(122)
sns.distplot(df['odometer'])
plt.tight_layout()
plt.xlim(0,250000)
plt.show()카테고리 독립변수 확인
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43plt.figure(figsize=(15,20))
plt.subplot(621)
sns.countplot(
data = df,
y = "fuel",
palette='Set1',
)
plt.title("연료 종류 - fuel")
plt.subplot(622)
sns.countplot(
data = df,
y = "cylinders",
palette='Set1',
order = df['cylinders'].value_counts().index)
plt.title("실린더 갯수 - cylinders")
plt.subplot(623)
sns.countplot(
data = df,
y = "transmission",
palette='Set1'
)
plt.title("변속기 - transmission")
plt.subplot(624)
sns.countplot(
data = df,
y = "drive",
palette='Set1'
)
plt.title("구동 방식 - drive")
plt.subplot(625)
sns.countplot(
data = df,
y = "title_status",
palette='Set1'
)
plt.title("차량 상태 - title_status")
plt.tight_layout()
plt.show()1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37plt.figure(figsize=(18,10))
plt.subplot(221)
sns.countplot(
data = df,
y = "type",
palette='Set1',
order = df['type'].value_counts().index
)
plt.title("차량 종류 - type")
plt.subplot(222)
sns.countplot(
data = df,
y = "paint_color",
palette='Set1',
order = df['paint_color'].value_counts().index)
plt.title("차량 색깔 - paint_color")
plt.subplot(223)
sns.countplot(
data = df,
y = "manufacturer",
palette='Set1',
order = df['manufacturer'].value_counts().iloc[:7].index,)
plt.title("제조사 - manufacturer")
plt.subplot(224)
sns.countplot(
data = df,
y = "state",
palette='Set1',
order = df['state'].value_counts().iloc[:7].index
)
plt.title("주 - state")
plt.tight_layout()
plt.show()실수형 변수 상관분석
1
# Pairwise scatter plots of the numeric variables. ``height`` is the current
# name of the per-facet size argument (it used to be called ``size``).
sns.pairplot(df[['price', 'odometer', 'year']], height=3)
plt.show()Partial Regression Plot(부분회귀분석)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16fig = plt.figure(figsize=(20,8))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
others = list(set(df.columns).difference(set(["price", "odometer"])))
p, resids = sm.graphics.plot_partregress(
"price", "odometer", others, data=df, obs_labels=False, ret_coords=True, ax = ax1
)
others2 = list(set(df.columns).difference(set(["price", "year"])))
p, resids = sm.graphics.plot_partregress(
"price", "year", others2, data=df, obs_labels=False, ret_coords=True, ax = ax2
)
plt.tight_layout()
plt.show()모델링, 데이터 분할
1
2
3
4
5
6
7
8# 빈도수에 따른 필터링
def value_counts_filter(num, columns):
    """Print the values of the global ``df`` that occur fewer than ``num`` times.

    For every column in ``columns`` that has at least one such value, two
    things are printed: the total number of rows holding those values, then
    the per-value counts followed by a separator line.

    Args:
        num: Exclusive upper bound on the occurrence count.
        columns: Iterable of column labels of ``df`` to inspect.

    Returns:
        None. The report is written to standard output.
    """
    for column in columns:
        # Count once and build the mask from the same Series it filters,
        # instead of relying on index alignment with a re-sorted copy.
        counts = df[column].value_counts()
        result = counts[counts < num]
        if len(result) != 0:
            print(result.values.sum())
            print(result, end="\n\n===========\n\n")
value_counts_filter(10, df.columns.difference(['price', 'odometer', 'year']))2개 초과 10개 미만 데이터 인덱스 확인
1
2
3
4
5
6
7
8
9
10
11
def check_under_10_index(start_num, end_num, columns, df):
    """Collect the row labels of every moderately rare category value.

    A value qualifies when its occurrence count ``c`` in its column
    satisfies ``start_num < c < end_num`` (both bounds exclusive).

    Args:
        start_num: Exclusive lower bound on the occurrence count.
        end_num: Exclusive upper bound on the occurrence count.
        columns: Column labels of ``df`` to inspect.
        df: DataFrame holding the data.

    Returns:
        A one-element list whose single item is a dict mapping each
        qualifying value to a NumPy array of the ``df`` index labels where
        it occurs. Wrapping the dict in a list makes ``pd.DataFrame(...)``
        build a single row with one column per value.

    NOTE(review): the flattened source does not show at which nesting level
    the original appended the dict to the result list; element ``[0]`` is
    the same either way. Confirm no caller depends on repeated rows.
    """
    rare_rows = {}
    for column in columns:
        # Count once per column; the original recomputed this per value.
        counts = df[column].value_counts()
        rare_values = counts[(counts < end_num) & (counts > start_num)].index
        for value in rare_values:
            rare_rows[value] = df[df[column] == value].index.values
    return [rare_rows]
2index_df = pd.DataFrame(check_under_10_index(2,10, ['cylinders', 'manufacturer', 'title_status', 'type'], df))
index_df데이터 분할에 사용할 인덱스 분류
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Split the rows of each rare category into three groups so the category is
# represented in the test set and in both halves of the training split.
for_test_data = []
for_train_data_train = []
for_train_data_test = []
for column in index_df.columns:
    # Row labels of one rare category, shuffled so the split is random.
    start = list(index_df[column][0])
    random.shuffle(start)
    if len(start) > 4:
        # ceil(len / 3) keeps the 3/3/x split for 7-9 rows and also yields
        # three groups for 5 or 6 rows. Fixed chunks of 3 produced only two
        # groups there, so m[2] raised IndexError.
        size = -(-len(start) // 3)
        m = [start[i:i + size] for i in range(0, len(start), size)]
    elif len(start) == 4:
        m = [start[:2], start[2:3], start[3:]]
    else:
        # One row per group; this needs exactly three rows in the category.
        m = [[i] for i in start]
    for_test_data.append(m[0])
    for_train_data_train.append(m[1])
    for_train_data_test.append(m[2])
# Section heading from the original post: "10개 미만 데이터 우선 삭제"
1
2
3
4
5
6
def delete_under_ten(df):
    """Drop rows whose categorical value occurs fewer than 10 times.

    Every column except ``id``, ``price``, ``odometer`` and ``year`` is
    checked in turn. Counts for a column are taken on the frame as already
    filtered by the preceding columns.

    Args:
        df: DataFrame to filter. It is not modified.

    Returns:
        A filtered DataFrame (the input itself when nothing is rare).
    """
    for column in df.columns.difference(['id', 'price', 'odometer', 'year']):
        counts = df[column].value_counts()
        rare_values = counts[counts < 10].index
        if len(rare_values):
            # One mask for all rare values instead of one filter per value.
            df = df[~df[column].isin(rare_values)]
    return df
df_deleted_under_ten = delete_under_ten(df)
2개 초과 10개 미만 데이터 균등 분배
1
2
3
4
5
6
7train_data, test_data = train_test_split(df_deleted_under_ten, test_size = .20, random_state = 7)
train_data = pd.concat([train_data, df.iloc[
[element for array in for_train_data_train for element in array] + [element for array in for_train_data_test for element in array]
]], axis=0)
test_data = pd.concat([test_data,df.iloc[
[element for array in for_test_data for element in array]]])
train_data.shape, test_data.shape데이터 확인
1
2msno.matrix(train_data)
plt.show()선형회귀분석(LinearRegression)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Baseline scikit-learn linear regression on log(price).
# Features: every column except the identifiers and the target itself.
X = train_data[train_data.columns.difference(['id_x', 'model', 'vin', 'price'])]
Y = np.log(train_data['price'])
# One-hot encode the categorical columns, dropping the first level of each.
X = pd.get_dummies(data=X, drop_first=True)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .20, random_state = 0)
test_df = pd.concat([Y_train, X_train], axis=1)
model = linear_model.LinearRegression()
result = model.fit(X_train, Y_train)
predicted = result.predict(X_test)
r2 = r2_score(Y_test, predicted)
print('Test R2 score : ', r2)
plt.scatter(Y_test, predicted)
pred_tr = result.predict(X_train)
pred_test = result.predict(X_test)
rmse_tr = np.sqrt(mean_squared_error(Y_train, pred_tr))
# RMSE is the square root of the mean squared error. The original took the
# square root of the raw (Y_test, pred_test) pair instead.
rmse_test = np.sqrt(mean_squared_error(Y_test, pred_test))
# cv = KFold(10, shuffle=True, random_state=0)
model_0_cross_val_score = cross_val_score(model, X, Y, scoring=None, cv=10)
print('RMSE of Train Data : ', rmse_tr)
print('RMSE of Test Data : ', rmse_test)
print('K-fold : ', model_0_cross_val_score)Model 1
1
2
3from used_car_regression import UsedCarRegression
usedcar = UsedCarRegression(df)
df.shape1
2# 모델1 full rank
model1_formula = "np.log(price) ~ scale(odometer) + scale(year) +C(manufacturer)+C(cylinders)+C(drive)+C(fuel)+C(state)+C(title_status)+C(transmission)+C(type)+C(paint_color) + 0"1
2
3
4
5
6
7
8
9
10
11
def summary_minimize_df(result, model_num):
    """Condense a fitted model's headline statistics into one column.

    Args:
        result: Object exposing ``rsquared``, ``rsquared_adj``, ``fvalue``,
            ``f_pvalue``, ``llf``, ``aic`` and ``bic`` attributes.
        model_num: Label used as the name of the single output column.

    Returns:
        A DataFrame with one row per statistic and one column named
        ``model_num``.
    """
    # Named ``summary`` so it does not shadow the notebook's global ``df``.
    summary = pd.DataFrame([{
        "R-squared": result.rsquared,
        "Adj. R-squared": result.rsquared_adj,
        "F-statistic": result.fvalue,
        "Prob (F-statistic)": result.f_pvalue,
        "Log-Likelihood": result.llf,
        "AIC": result.aic,
        "BIC": result.bic
    }]).T
    summary.columns = [model_num]
    return summary


# Section heading from the original post: "모델 학습"
1
2result, train_data, test_data, train_ls, test_ls = usedcar.model_fit(model1_formula)
pred = result.predict(test_data)1
2model1_min_df = summary_minimize_df(result, "Model 1")
model1_min_df1
2# R2 score 확인
r2_score(np.log(test_data['price']), pred)교차 검증(cross validation)
1
2cross_validation_model1 = usedcar.cross_validation(model1_formula)
cross_validation_model1ANOVA 독립변수 확인
1
2anova = sm.stats.anova_lm(result, typ=2)
anova.sort_values(by=['F'], ascending=False)Model 2 : 실수형 * 범주형 interaction
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29Numbers = ['scale(odometer)', 'scale(year)']
X = ['fuel', 'title_status', 'transmission', 'drive', 'type', 'paint_color',"cylinders", "manufacturer"]
combination_score = []
combination_name = []
for number in Numbers:
feature = number
for i in X:
interaction = feature + ":" + i
kf = KFold(5, shuffle=True, random_state=0)
mode2_cross_val_score = []
for X_train_index, X_test_index in kf.split(train_data):
X_train= train_data.iloc[X_train_index]
X_test = train_data.iloc[X_test_index]
X_train = pd.concat([X_train, train_data[train_data.index.isin([element for array in for_train_data_train for element in array])]], axis=0)
X_test = pd.concat([X_test, train_data[train_data.index.isin([element for array in for_train_data_test for element in array])]], axis=0)
model1 = sm.OLS.from_formula("np.log(price) ~ scale(odometer) + scale(year) +{}+{}".format("+".join(X_train.columns.difference(["price",'odometer','year'])), interaction), X_train)
result = model1.fit()
pred = result.predict(X_test)
R2 = r2_score(np.log(X_test.price), pred)
n = train_data.shape[0]
p = len(pd.get_dummies(train_data).columns)
Adjr2 = 1-(1-R2)*(n-1)/(n-p-1)
mode2_cross_val_score.append(Adjr2)
combination_score.append(mode2_cross_val_score)
combination_name.append(interaction)1
2
3
4fig1, ax1 = plt.subplots(figsize=(14, 6))
ax1.boxplot(combination_score)
ax1.set_xticklabels(combination_name, rotation=90)
plt.show()Model 2 formula
1
2model2_formula = "np.log(price) ~ scale(odometer) + scale(year) + C(manufacturer) + C(cylinders) + C(drive) + C(fuel) + C(state) + C(title_status) + C(transmission) + C(type) + C(paint_color)\
+ scale(odometer):C(cylinders) + scale(odometer):C(type) + scale(year):C(manufacturer) + scale(year):C(type) + scale(odometer):C(fuel)"모델학습
1
2
3
4result, train_data, test_data, train_ls, test_ls = usedcar.model_fit(model2_formula)
pred = result.predict(test_data)
model2_min_df = summary_minimize_df(result, 'Model 2')
pd.concat([model1_min_df,model2_min_df], axis=1)1
2# R2 score
r2_score(np.log(test_data['price']), pred)CV
1
2cross_validation_model2 = usedcar.cross_validation(model2_formula)
cross_validation_model2Model 3 : 범주형 * 범주형 interaction
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29X = ['fuel', 'title_status', 'transmission', 'drive', 'type', 'paint_color', 'cylinders', 'manufacturer']
combination_score = []
combination_name = []
for count in range(len(X)):
feature = X[-1]
X.pop()
for i in X:
interaction = feature + ":" + i
kf = KFold(5, shuffle=True, random_state=0)
model_cross_val_score = []
for X_train_index, X_test_index in kf.split(train_data):
X_train= train_data.iloc[X_train_index]
X_test = train_data.iloc[X_test_index]
X_train = pd.concat([X_train, train_data[train_data.index.isin([element for array in for_train_data_train for element in array])]], axis=0)
X_test = pd.concat([X_test, train_data[train_data.index.isin([element for array in for_train_data_test for element in array])]], axis=0)
model1 = sm.OLS.from_formula("np.log(price) ~ scale(odometer) + scale(year) + {} + {}".format("+".join(X_train.columns.difference(['price', 'odometer', 'year', feature, i])), interaction), X_train)
result = model1.fit()
pred = result.predict(X_test)
R2 = r2_score(np.log(X_test.price),pred)
n = train_data.shape[0]
p = len(pd.get_dummies(train_data).columns)
Adjr2 = 1-(1-R2)*(n-1)/(n-p-1)
model_cross_val_score.append(Adjr2)
combination_score.append(model_cross_val_score)
combination_name.append(interaction)1
2
3
4fig1, ax1 = plt.subplots(figsize=(14, 6))
ax1.boxplot(combination_score)
ax1.set_xticklabels(combination_name, rotation=90)
plt.show()1
model3_formula = "np.log(price) ~ scale(odometer) + scale(year) + C(manufacturer) + C(cylinders) + C(drive) + C(fuel) + C(state) + C(title_status) + C(transmission) + C(type) + C(paint_color) + scale(odometer):C(cylinders) + scale(odometer):C(type) + scale(year):C(manufacturer) + scale(year):C(type) + scale(odometer):C(fuel) + C(manufacturer):C(cylinders) + C(type):C(drive)"
모델학습
1
2
3
4result, train_data, test_data, train_ls, test_ls = usedcar.model_fit(model3_formula)
pred = result.predict(test_data)
model3_min_df = summary_minimize_df(result, 'Model 3')
pd.concat([model1_min_df, model2_min_df, model3_min_df], axis=1)1
2# R2 score
r2_score(np.log(test_data['price']), pred)CV
1
2cross_validation_model3 = usedcar.cross_validation(model3_formula)
cross_validation_model3Model 4 : year와 odometer 다항식 추가
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16# Partial Regression Plot을 확인하면 약간 2차방정식 모형처럼 보여서 진행
fig = plt.figure(figsize=(20,8))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
others = list(set(df.columns).difference(set(["price", "odometer"])))
p, resids = sm.graphics.plot_partregress(
"price", "odometer", others, data=df, obs_labels=False, ret_coords=True, ax = ax1
)
others2 = list(set(df.columns).difference(set(["price", "year"])))
p, resids = sm.graphics.plot_partregress(
"price", "year", others2, data=df, obs_labels=False, ret_coords=True, ax = ax2
)
plt.tight_layout()
plt.show()year에 제곱항을 더했을 때
1
2
3
4
5
6
7
8
9
10score = []
formula = "np.log(price) ~ scale(odometer) + scale(year) + C(manufacturer) + C(cylinders) + C(drive) + C(fuel) + C(state) + C(title_status) + C(transmission) + C(type) + C(paint_color) + scale(odometer):C(cylinders) + scale(odometer):C(type) + scale(year):C(manufacturer) + scale(year):C(type) + scale(odometer):C(fuel) + C(manufacturer):C(cylinders) + C(type):C(drive)"
for i in range(1, 5):
if i == 1:
score.append(usedcar.cross_validation(formula))
else:
formula += " + scale(I(year**{}))".format(i)
score.append(usedcar.cross_validation(formula))1
2
3
4
5name = ['1차항', '2차항', '3차항', '4차항']
fig1, ax1 = plt.subplots(figsize=(14, 6))
ax1.boxplot(score[:4])
ax1.set_xticklabels(name)
plt.show()odometer에 제곱항을 더했을 때
1
2
3
4
5
6
7
8
9
10score = []
formula = "np.log(price) ~ scale(odometer) + scale(year) + C(manufacturer) + C(cylinders) + C(drive) + C(fuel) + C(state) + C(title_status) + C(transmission) + C(type) + C(paint_color)"
for i in range(1,5):
if i == 1:
score.append(usedcar.cross_validation(formula))
else:
formula += " + scale(I(odometer**{}))".format(i)
score.append(usedcar.cross_validation(formula))1
2
3
4
5name = ['1차항', '2차항', '3차항', '4차항']
fig1, ax1 = plt.subplots(figsize=(14, 6))
ax1.boxplot(score)
ax1.set_xticklabels(name)
plt.show()year와 odometer 3차항까지 추가한 모델 비교
1
2
3model4_1_formula = "np.log(price) ~ scale(odometer) + scale(year) + C(manufacturer) + C(cylinders) + C(drive) + C(fuel) + C(state) + C(title_status) + C(transmission) + C(type) + C(paint_color) + scale(odometer):C(cylinders) + scale(odometer):C(type) + scale(year):C(manufacturer) + scale(year):C(type) + scale(odometer):C(fuel) + C(manufacturer):C(cylinders) + C(type):C(drive) + scale(I(year**2)) + scale(I(year**3)) + scale(I(odometer**2)) + scale(I(odometer**3)) "
model4_2_formula = "np.log(price) ~ scale(odometer) + scale(year) + C(manufacturer) + C(cylinders) + C(drive) + C(fuel) + C(state) + C(title_status) + C(transmission) + C(type) + C(paint_color) + scale(odometer):C(cylinders) + scale(odometer):C(type) + scale(year):C(manufacturer) + scale(year):C(type) + scale(odometer):C(fuel) + C(manufacturer):C(cylinders) + C(type):C(drive) + scale(I(year**2)) + scale(I(year**3)) "
model4_3_formula = "np.log(price) ~ scale(odometer) + scale(year) + C(manufacturer) + C(cylinders) + C(drive) + C(fuel) + C(state) + C(title_status) + C(transmission) + C(type) + C(paint_color) + scale(odometer):C(cylinders) + scale(odometer):C(type) + scale(year):C(manufacturer) + scale(year):C(type) + scale(odometer):C(fuel) + C(manufacturer):C(cylinders) + C(type):C(drive) + scale(I(odometer**2)) + scale(I(odometer**3)) "검증결과 비교
1
2
3
4
5
6
7
8
9
10
11
12
13
14cross_validation_model4_1 = usedcar.cross_validation(model4_1_formula)
cross_validation_model4_2 = usedcar.cross_validation(model4_2_formula)
cross_validation_model4_3 = usedcar.cross_validation(model4_3_formula)
fig, ax = plt.subplots(figsize=(15, 8))
ax.boxplot(
[
cross_validation_model4_1, cross_validation_model4_2, cross_validation_model4_3
],
sym="b*",
labels=['Model 4_1(year, odometer 다차항)', 'Model 4_2(year 다차항)', 'Model 4_3(odometer 다차항)'],
)
plt.title('모델별 K-fold 검증 비교')
plt.show()Model 4 결론 : year에 3차항까지 추가
1
model4_formula = "np.log(price) ~ scale(odometer) + scale(year) + C(manufacturer) + C(cylinders) + C(drive) + C(fuel) + C(state) + C(title_status) + C(transmission) + C(type) + C(paint_color) + scale(odometer):C(cylinders) + scale(odometer):C(type) + scale(year):C(manufacturer) + scale(year):C(type) + scale(odometer):C(fuel) + C(manufacturer):C(cylinders) + C(type):C(drive) + scale(I(year**2)) + scale(I(year**3)) "
모델학습
1
2
3
4result, train_data, test_data, train_ls, test_ls = usedcar.model_fit(model4_formula)
pred = result.predict(test_data)
model4_min_df = summary_minimize_df(result, 'Model 4')
pd.concat([model1_min_df, model2_min_df, model3_min_df, model4_min_df], axis=1)1
2# R2 score 확인
r2_score(np.log(test_data['price']),pred)CV
1
2cross_validation_model4 = usedcar.cross_validation(model4_formula)
cross_validation_model4모델별 성능 비교
1
2
3
4
5
6
7
8
9
10
11
12fig, ax = plt.subplots(figsize=(15, 8))
ax.boxplot(
[
cross_validation_model1, cross_validation_model2,
cross_validation_model3, cross_validation_model4,
],
sym="b*",
labels=['Model 1', 'Model 2', 'Model 3', 'Model 4'],
)
plt.title('모델별 K-fold 검증 비교')
plt.show()Model 5 : 중고차 워런티 제도로 odometer 분포 탐색
1
2
3
4
5
6
7
8plt.figure(figsize=(16,7))
plt.subplot(121)
sns.distplot(df['odometer'])
plt.subplot(122)
sns.distplot(df['odometer'])
plt.tight_layout()
plt.xlim(0,150000)
plt.show()6만 마일 기준, 미만 = 1, 이상 = 0
1
2df['odometer_under_100000'] = df['odometer'].apply(lambda x: 0 if x >= 60000 else 1)
df['odometer_under_100000'].unique()1
usedcar = UsedCarRegression(df)
1
model5_formula = "np.log(price) ~ scale(odometer) + scale(year) + C(manufacturer) + C(cylinders) + C(drive) + C(fuel) + C(state) + C(title_status) + C(transmission) + C(type) + C(paint_color) + scale(odometer):C(cylinders) + scale(odometer):C(type) + scale(year):C(manufacturer) + scale(year):C(type) + scale(odometer):C(fuel) + C(manufacturer):C(cylinders) + C(type):C(drive) + scale(I(year**2)) + scale(I(year**3)) + C(odometer_under_100000)"
모델학습
1
2
3
4result, train_data, test_data, train_ls, test_ls = usedcar.model_fit(model5_formula)
pred = result.predict(test_data)
model5_min_df = summary_minimize_df(result, 'Model 5')
pd.concat([model1_min_df,model2_min_df,model3_min_df,model4_min_df,model5_min_df], axis=1)1
2# R2 score 확인
r2_score(np.log(test_data['price']), pred)1
2
3# CV
cross_validation_model5 = usedcar.cross_validation(model5_formula)
cross_validation_model51
2
3
4
5
6
7
8
9
10
11
12
13
14# 모델별 성능비교
fig, ax = plt.subplots(figsize=(15, 8))
ax.boxplot(
[
cross_validation_model1, cross_validation_model2,
cross_validation_model3, cross_validation_model4,
cross_validation_model5
],
sym="b*",
labels=['Model 1', 'Model 2', 'Model 3', 'Model 4', 'Model 5'],
)
plt.title('모델별 K-fold 검증 비교')
plt.show()Model 6 : 성능이 가장 좋은 모델4에 Ridge, Lasso, Elastic Net 적용
주의 : 시간이 오래 걸림, 기억으로는 약 4시간 30분 정도 소요됨
1 | df = df.drop('odometer_under_100000',axis=1) |
1 | usedcar = UsedCarRegression(df) |
1 | # Ridge |
1 | # Elastic_net |
1 | elastic_net_df = pd.read_csv('./csv_file/elastic.csv', index_col=[0]) |
1 | lasso_df['Lasso'] = lasso_df['Lasso'].str.extract(r'(0[.][0-9]*)').astype('float') |
1 | plt.figure(figsize=(16,8)) |
1 | model6_formula = "np.log(price) ~ scale(odometer) + scale(year) + C(manufacturer) + C(cylinders) + C(drive) + C(fuel) + C(state) + C(title_status) + C(transmission) + C(type) + C(paint_color) + scale(odometer):C(cylinders) + scale(odometer):C(type) + scale(year):C(manufacturer) + scale(year):C(type) + scale(odometer):C(fuel) + C(manufacturer):C(cylinders) + C(type):C(drive) + scale(I(year**2)) + scale(I(year**3))" |
1 | usedcar = UsedCarRegression(df) |
1 | result, train_data, test_data, train_ls, test_ls = usedcar.model_fit(model6_formula) |
1 | score, result, cross_validation_model6 = usedcar.regularized_method( |
- K-Fold 검증 성능 비교
1
2
3
4
5
6
7
8
9
10
11
12
13# odometer 6만
fig, ax = plt.subplots(figsize=(15, 8))
ax.boxplot(
[
cross_validation_model1, cross_validation_model2,
cross_validation_model3, cross_validation_model4,
cross_validation_model5, cross_validation_model6
],
sym="b*",
labels=['Model 1', 'Model 2', 'Model 3', 'Model 4', 'Model 5', 'Model 6'],
)
plt.title('모델별 K-fold 검증 비교')
plt.show() - 모델별 Test 데이터로 예측한 R-square
1
model1_formula = "np.log(price) ~ scale(odometer) + scale(year) + C(manufacturer) + C(cylinders) + C(drive) + C(fuel) + C(state) + C(title_status) + C(transmission) + C(type) + C(paint_color)"
1
2
3
4result = usedcar.model_fit(model1_formula)
prediction = result[0].predict(test_data[test_data.columns.difference(['price'])])
model_1_test_r2 = r2_score(np.log(test_data['price']), prediction)
print("Model 1 R-sqaure : ", model_1_test_r2)1
model2_formula = "np.log(price) ~ scale(odometer) + scale(year) + C(manufacturer) + C(cylinders) + C(drive) + C(fuel) + C(state) + C(title_status) + C(transmission) + C(type) + C(paint_color) + scale(odometer):C(cylinders) + scale(odometer):C(type) + scale(year):C(manufacturer)"
1
2
3
4result= usedcar.model_fit(model2_formula)
prediction = result[0].predict(test_data[test_data.columns.difference(['price'])])
model_2_test_r2 = r2_score(np.log(test_data['price']), prediction)
print("Model 2 R-sqaure : ", model_2_test_r2)1
model3_formula = "np.log(price) ~ scale(odometer) + scale(year) + C(manufacturer) + C(cylinders) + C(drive) + C(fuel) + C(state) + C(title_status) + C(transmission) + C(type) + C(paint_color) + scale(odometer):C(cylinders) + scale(odometer):C(type) + scale(year):C(manufacturer) + C(manufacturer):C(cylinders) + C(type):C(drive)"
1
2
3
4result= usedcar.model_fit(model3_formula)
prediction = result[0].predict(test_data[test_data.columns.difference(['price'])])
model_3_test_r2 = r2_score(np.log(test_data['price']), prediction)
print("Model 3 R-sqaure : ", model_3_test_r2)1
model4_formula = "np.log(price) ~ scale(odometer) + scale(year) + C(manufacturer) + C(cylinders) + C(drive) + C(fuel) + C(state) + C(title_status) + C(transmission) + C(type) + C(paint_color) + scale(odometer):C(cylinders) + scale(odometer):C(type) + scale(year):C(manufacturer) + scale(year):C(type) + scale(odometer):C(fuel) + C(manufacturer):C(cylinders) + C(type):C(drive) + scale(I(year**2)) + scale(I(year**3)) "
1
2
3
4
5result = usedcar.model_fit(model4_formula)
prediction = result[0].predict(test_data[test_data.columns.difference(
['price'])])
model_4_test_r2 = r2_score(np.log(test_data['price']), prediction)
print("Model 4 R-sqaure : ", model_4_test_r2)1
model5_formula = "np.log(price) ~ scale(odometer) + scale(year) + C(manufacturer) + C(cylinders) + C(drive) + C(fuel) + C(state) + C(title_status) + C(transmission) + C(type) + C(paint_color) + scale(odometer):C(cylinders) + scale(odometer):C(type)+scale(year):C(manufacturer) + scale(year):C(type) + scale(odometer):C(fuel) + C(manufacturer):C(cylinders) + C(type):C(drive) + scale(I(year**2)) + scale(I(year**3)) + C(odometer_under_100000)"
1
df['odometer_under_100000'] = df['odometer'].apply(lambda x: 0 if x >= 60000 else 1)
1
usedcar = UsedCarRegression(df)
1
test_data['odometer_under_100000'] = test_data['odometer'].apply(lambda x: 0 if x >= 60000 else 1)
1
2
3
4
5result = usedcar.model_fit(model5_formula)
prediction = result[0].predict(test_data[test_data.columns.difference(
['price'])])
model_5_test_r2 = r2_score(np.log(test_data['price']), prediction)
print("Model 5 R-sqaure : ", model_5_test_r2)1
2test_data = test_data.drop('odometer_under_100000', axis=1)
df = df.drop('odometer_under_100000', axis=1)1
model6_formula = "np.log(price) ~ scale(odometer) + scale(year) + C(manufacturer) + C(cylinders) + C(drive) + C(fuel) + C(state) + C(title_status) + C(transmission) + C(type) + C(paint_color) + scale(odometer):C(cylinders) + scale(odometer):C(type) + scale(year):C(manufacturer) + scale(year):C(type) + scale(odometer):C(fuel) + C(manufacturer):C(cylinders) + C(type):C(drive) + scale(I(year**2)) + scale(I(year**3))"
1
result, train_data, test_data, train_ls, test_ls = usedcar.model_fit(model6_formula)
1
2score, result, cross_validation_model6 = usedcar.regularized_method(
model6_formula, cv=10, alpha=0.00001, L1_wt=0)1
2
3
4prediction = result.predict(test_data[test_data.columns.difference(
['price'])])
model_6_test_r2 = r2_score(np.log(test_data['price']), prediction)
print("Model 6 R-sqaure : ", model_6_test_r2)1
2
3
4
5
6
7
8final_result = {
"model 1": model_1_test_r2,
"model 2": model_2_test_r2,
"model 3": model_3_test_r2,
"model 4": model_4_test_r2,
"model 5": model_5_test_r2,
"model 6": model_6_test_r2,
}1
2
3
4
5
6
7
8
9
10pd.DataFrame(final_result,index=[0]).plot.bar(figsize=(20,10))
plt.ylim(0.8,0.9)
plt.title('모델별 Test 데이터 R2 score')
plt.tick_params(
axis='x', # changes apply to the x-axis
which='both', # both major and minor ticks are affected
bottom=False, # ticks along the bottom Edge are off
top=False, # ticks along the top Edge are off
labelbottom=False) # labels along the bottom Edge are off
plt.show() - 가설검증 1
주행거리 5만 킬로미터 이하일 때 파는 게 좋을 것이다.
1 | sigma_2_df['vehicle'].value_counts()[:5] |
1 | # 거래가 가장 많은 차종으로 선택 |
1 | def hypothesis_proof(df,result): |
1 | hypothesis_1, data = hypothesis_proof(data, result) |
1 | plt.figure(figsize=(20,9)) |
가설 1 결론 : 3만 마일(약 48,000km)부터 가격이 급격하게 떨어지는 것을 볼 수 있다.
- 가설검증 2
지역별 가격차이가 있을 것이다
1 | anova = sm.stats.anova_lm(result, typ=2) |
1 | data = sigma_2_df[sigma_2_df['vehicle'] == "2012 Ford F-150 FX4"] |
- 회고
Keep
- 분석 목적
- 가설 설정
- EDA 및 전처리
Problem
- 자동차 보증수리 여부에 대한 데이터의 부재로, Model 5의 아이디어를 좀 더 발전시키지 못한 점
- 머신러닝 때 배운 페이스북 예측 모델 넣어서 예측해보기
Try
- 주기적으로 업데이트되는 데이터를 활용한 예측 진행