# !pip install --user plotly_express


# import libraries
import pandas as pd 
import numpy as np
import time
import plotly_express as px
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool 
import xgboost as xgb
# show graphs in html
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"


# read dataset
df = pd.read_csv('datasets/car_data.csv', parse_dates=True)


# look at data
df.head()


# summary of columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   DateCrawled        354369 non-null  object
 1   Price              354369 non-null  int64 
 2   VehicleType        316879 non-null  object
 3   RegistrationYear   354369 non-null  int64 
 4   Gearbox            334536 non-null  object
 5   Power              354369 non-null  int64 
 6   Model              334664 non-null  object
 7   Mileage            354369 non-null  int64 
 8   RegistrationMonth  354369 non-null  int64 
 9   FuelType           321474 non-null  object
 10  Brand              354369 non-null  object
 11  NotRepaired        283215 non-null  object
 12  DateCreated        354369 non-null  object
 13  NumberOfPictures   354369 non-null  int64 
 14  PostalCode         354369 non-null  int64 
 15  LastSeen           354369 non-null  object
dtypes: int64(7), object(9)
memory usage: 43.3+ MB


# loking for missing values
df.isna().sum()

DateCrawled              0
Price                    0
VehicleType          37490
RegistrationYear         0
Gearbox              19833
Power                    0
Model                19705
Mileage                  0
RegistrationMonth        0
FuelType             32895
Brand                    0
NotRepaired          71154
DateCreated              0
NumberOfPictures         0
PostalCode               0
LastSeen                 0
dtype: int64


# change column names
df.columns =['date_crawled', 'price', 'vehicle_type', 'registration_year', 'gearbox', 'power', 'model', 'mileage', 'registration_month', 'fuel_type', 'brand', 'not_repaired', 'date_created', 'number_of_pictures', 'postal_code', 'last_seen']


# Percentage missing 
df.not_repaired.isna().sum() / len(df) * 100

20.079070121822166


# Percentage missing 
df.vehicle_type.isna().sum() / len(df) * 100

10.57936783409384


# Percentage missing 
df.gearbox.isna().sum() / len(df) * 100

5.596708515699737


# Percentage missing 
df.fuel_type.isna().sum() / len(df) * 100

9.282696849893755


# changing datatypes to datetimes
df.date_crawled = pd.to_datetime(df.date_crawled)


# check proper implementation
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   date_crawled        354369 non-null  datetime64[ns]
 1   price               354369 non-null  int64         
 2   vehicle_type        316879 non-null  object        
 3   registration_year   354369 non-null  int64         
 4   gearbox             334536 non-null  object        
 5   power               354369 non-null  int64         
 6   model               334664 non-null  object        
 7   mileage             354369 non-null  int64         
 8   registration_month  354369 non-null  int64         
 9   fuel_type           321474 non-null  object        
 10  brand               354369 non-null  object        
 11  not_repaired        283215 non-null  object        
 12  date_created        354369 non-null  object        
 13  number_of_pictures  354369 non-null  int64         
 14  postal_code         354369 non-null  int64         
 15  last_seen           354369 non-null  object        
dtypes: datetime64[ns](1), int64(7), object(8)
memory usage: 43.3+ MB


# extracting data from datetime columns
df['year_crawled'] = df.date_crawled.dt.year
df['month_crawled'] = df.date_crawled.dt.month
df['day_crawled'] = df.date_crawled.dt.day
df['time_crawled'] = df.date_crawled.dt.hour


# drop unnecessary columns
df.drop(['date_crawled', 'date_created', 'last_seen'], axis=1, inplace=True)


# visual of data
df.head()


# look at the columns names
df.columns

Index(['price', 'vehicle_type', 'registration_year', 'gearbox', 'power',
       'model', 'mileage', 'registration_month', 'fuel_type', 'brand',
       'not_repaired', 'number_of_pictures', 'postal_code', 'year_crawled',
       'month_crawled', 'day_crawled', 'time_crawled'],
      dtype='object')


# correlation of categories
df.corr()


# summary statistics of data
df.describe()


# number of picture values
df.number_of_pictures.value_counts(dropna=False)

0    354369
Name: number_of_pictures, dtype: int64


# drop unnecessary column
df.drop('number_of_pictures', axis=1, inplace=True)


# dropping rows where vehicle type and model are missing
mask = (df['vehicle_type'].isnull() & df['model'].isnull())
df.drop(df[mask].index, inplace=True)


# Check for proper implementation
(df['vehicle_type'].isnull() & df['model'].isnull()).sum()

0


# petrol and gasoline are the same
df.fuel_type = df.fuel_type.replace('gasoline', 'petrol')


# number of cars with registration years below 1960
df.query("registration_year < 1960").head()


# filtering outliers of dataset
df = df.query("(registration_year > 1960) and (registration_year <= 2019)")


# looking for outliers
df.registration_year.value_counts()

2000    23177
1999    22717
2005    21566
2001    20115
2006    19830
2003    19754
2004    19598
2002    19087
1998    17880
2007    17142
2008    15358
1997    14649
2009    14460
1996    10823
2010    10723
2011     9884
1995     9170
2017     9015
2016     8318
2012     7166
1994     4840
2013     4114
1993     3434
2018     3386
1992     3001
1991     2761
2014     2519
1990     2501
1989     1317
2015     1192
1988      941
1985      684
1987      666
1986      537
1980      482
1983      430
1984      399
1982      316
1978      285
1979      283
1970      272
1981      263
1972      213
1976      175
1977      173
1973      173
1974      168
1971      167
1975      143
1969      118
1966      111
1968      108
1967       99
1965       82
1964       64
1963       57
1962       46
1961       41
2019       20
Name: registration_year, dtype: int64


# boxplot of registration
fig1 = px.box(df.registration_year)
fig1


# fig1.write_html('figures/fig1.html')


# values of power
df.power.value_counts()

0       36270
75      23721
60      15687
150     14443
101     13158
        ...  
2789        1
519         1
2461        1
6006        1
1241        1
Name: power, Length: 706, dtype: int64


# looking at outliers of power
fig2 = px.box(df.power)
fig2


# fig2.write_html('figures/fig2.html')


# filtering outliers of dataset
df = df.query("(power > 20) and (power < 500)")


# filter for categories
categories = ['price', 'vehicle_type', 'registration_year', 'gearbox', 'power', 'mileage', 'registration_month', 'fuel_type', 'brand',
       'not_repaired']


# loop for displaying histograms of categories 
for i in categories:
    px.histogram(df[i], title='Distribution of ' + str.upper(i).replace('_', ' '), labels={'value': i}, template='seaborn').show()


# fig5.write_html('figures/fig5.html')


# missing values
df.isna().sum()

price                     0
vehicle_type          19582
registration_year         0
gearbox                5989
power                     0
model                  9950
mileage                   0
registration_month        0
fuel_type             19290
brand                     0
not_repaired          47864
postal_code               0
year_crawled              0
month_crawled             0
day_crawled               0
time_crawled              0
dtype: int64


# fill not repaired missing values with no
df.not_repaired.fillna('no', inplace=True)


# check proper implementation
df.not_repaired.value_counts(dropna=False)

no     279108
yes     30780
Name: not_repaired, dtype: int64


# looking at missing gearbox values
df.gearbox.value_counts(dropna=False)

manual    243171
auto       60728
NaN         5989
Name: gearbox, dtype: int64


# looking at range of registration year
df.registration_year.value_counts().sort_values(axis=0, ascending=False)

1999    20122
2000    19072
2005    18982
2006    18649
2001    18243
2003    18128
2004    18000
2002    17462
2007    16174
1998    15720
2008    14592
2009    13874
1997    12948
2010    10381
2011     9594
1996     9505
1995     7482
2012     6935
2017     6809
2016     5911
1994     4281
2013     3986
1993     3034
1992     2680
2018     2593
1991     2435
2014     2403
1990     1913
1989     1136
2015     1075
1988      826
1987      566
1985      490
1986      434
1983      360
1984      343
1982      277
1980      255
1979      236
1978      226
1981      225
1972      160
1977      139
1973      129
1976      129
1974      126
1970      120
1971      120
1975      104
1969       89
1968       85
1966       80
1967       62
1965       48
1964       41
1963       38
1962       26
1961       25
2019       10
Name: registration_year, dtype: int64


# query for price is 0
df.query("price == 0").head()


# making copy of original dataset
df_copy = df.copy()


# Select only the object columns from the DataFrame
object_columns = df_copy.select_dtypes(include='object')


# fill missing obkect columns with tring NaN
object_columns.fillna('NaN', inplace=True)


# label encoding categorical columns
encoder = OrdinalEncoder()
ordinal = pd.DataFrame(encoder.fit_transform(object_columns), columns=object_columns.columns)


# checking implementation
ordinal.head()


# check the ordinal index parameters
ordinal.index

RangeIndex(start=0, stop=309888, step=1)


# start index at 1
ordinal.index = ordinal.index + 1


# check implementation
ordinal.head()


# checking index
ordinal.index

RangeIndex(start=1, stop=309889, step=1)


# new ordinal shape
ordinal.shape

(309888, 6)


# compare to original df shape
df.shape

(309888, 16)


# Replace the original object columns in the DataFrame with the encoded columns
df_copy[object_columns.columns] = ordinal


# for comparisons
df.head()


# check full implementation of label encoding
df_copy.head()


# using iterative imputer
imp = IterativeImputer()
data_imp = imp.fit_transform(df_copy)


# making a new dataframe for encoded data, rounding to nearest integer
df1 = pd.DataFrame(data_imp, columns=df.columns)
df1 = df1.round({'vehicle_type':0, 'gearbox':0, 'model':0, 'brand':0, 'not_repaired':0, 'fuel_type':0})


# visual of new dataset
df1.head()


# checking missing values
df1.isna().sum()

price                 0
vehicle_type          0
registration_year     0
gearbox               0
power                 0
model                 0
mileage               0
registration_month    0
fuel_type             0
brand                 0
not_repaired          0
postal_code           0
year_crawled          0
month_crawled         0
day_crawled           0
time_crawled          0
dtype: int64


# looking at vehicle type values
df.vehicle_type.value_counts(sort=True, dropna=False)

sedan          84603
small          71226
wagon          60397
bus            26483
NaN            19582
convertible    18983
coupe          15018
suv            11064
other           2532
Name: vehicle_type, dtype: int64


# checking vehicle type value counts
df1.vehicle_type.value_counts(sort=True, dropna=False)

5.0    112928
6.0     62216
8.0     52879
1.0     23181
0.0     17102
2.0     16579
3.0     13117
7.0      9674
4.0      2212
Name: vehicle_type, dtype: int64


# visual of original data
df.head()


# small cars
df.query("vehicle_type == 'small'").head()


# visual of encoded data
df1.head()


# type 4 cars
df1.query('vehicle_type == 4').head()


df1.columns

Index(['price', 'vehicle_type', 'registration_year', 'gearbox', 'power',
       'model', 'mileage', 'registration_month', 'fuel_type', 'brand',
       'not_repaired', 'postal_code', 'year_crawled', 'month_crawled',
       'day_crawled', 'time_crawled'],
      dtype='object')


# maybe look to eliminate more useless columns from features
features = df1.drop(['price','year_crawled', 'month_crawled',
       'day_crawled', 'time_crawled'], axis=1)
target = df1['price']


# train test split
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.25, random_state=19)


# train validate split
features_train, features_valid, target_train, target_valid = train_test_split(
    features_train, target_train, test_size=0.2, random_state=19)


# Scaling numerical columns
numeric = ['registration_year', 'power', 'mileage', 'postal_code']

scaler = StandardScaler()
scaler.fit(features_train[numeric])

features_train[numeric] = scaler.transform(features_train[numeric])
features_test[numeric] = scaler.transform(features_test[numeric])


# Creating new features to test prediction times
new_features = pd.DataFrame(
    [
        [4, 1999, 1, 75, 108, 150000, 3, 5, 38, 0, 10115],
        [5, 2008, 1, 150, 166, 150000, 9, 5, 31, 0, 71672]
    ],
    columns=features.columns
)


# for regressions
new_features


# %%timeit
# decision tree
best_model = None
best_result = 10000
best_depth = 0
for depth in range(1, 15): # choose hyperparameter range
    dec_tr = DecisionTreeRegressor(random_state=19, max_depth=depth)
    dec_tr.fit(features_train, target_train) # train model on training set
    predictions_valid_dt = dec_tr.predict(features_valid) # get model predictions on validation set
    result = mse(target_valid, predictions_valid_dt) ** 0.5
    if result < best_result:
        best_model = dec_tr
        best_result = result
        best_depth = depth

print(f"RMSE of the best model on the validation set (max_depth = {best_depth}): {best_result}")

RMSE of the best model on the validation set (max_depth = 5): 4689.009293240116


# %%timeit
# Decision tree prediction
dec_tr.predict(new_features)

array([16500., 17990.])


# %%timeit
# random forest
best_model = None
best_result = 10000
best_est = 0
best_depth = 0
for est in range(40, 51, 5):
    for depth in range (10, 15, 5):
        rf = RandomForestRegressor(random_state=19, n_estimators=est, max_depth=depth)
        rf.fit(features_train, target_train) # train model on training set
        predictions_valid_rf = rf.predict(features_valid) # get model predictions on validation set
        result = mse(target_valid, predictions_valid_rf) ** 0.5 # calculate RMSE on validation set
        if result < best_result:
            best_model = rf
            best_result = result
            best_est = est
            best_depth = depth

print("RMSE of the best model on the validation set:", best_result, "n_estimators:", best_est, "best_depth:", depth)

RMSE of the best model on the validation set: 8585.188109611085 n_estimators: 45 best_depth: 10


# %%timeit
# random forest prediction
rf.predict(new_features)

array([12314.99208565, 13272.2579312 ])


# %%timeit
# Linear regression
lr = LinearRegression() # initialize model constructor
lr.fit(features_train, target_train) # train model on training set
predictions_valid_lr = lr.predict(features_valid) # get model predictions on validation set

result = mse(target_valid, predictions_valid_lr) ** 0.5 # calculate RMSE on validation set
print("RMSE of the linear regression model on the validation set:", result)

RMSE of the linear regression model on the validation set: 206533854.8212574


#%%timeit
# linear regression prediction
lr.predict(new_features)

array([-2.37722899e+08, -2.30083006e+08])


# creating table of regression metrics
data = {'decision tree': (2229.78, 10.5, 1.53, 0.00234, 5.18 * 10 ** -4),  'random forest': (2172.63, 77, 10.7, 0.00426, 1.58 * 10 ** -4), 'linear regression': (3177.86, .0928, .0691, 0.00125, 3.01 * 10 ** -4)}
regr_scores = pd.DataFrame(data=data, columns=data.keys(), index=['RMSE', 'train_time', 'train_std', 'pred_time', 'pred_std'])
regr_scores = regr_scores.T


# regression RMSE
px.scatter(regr_scores.RMSE, color_discrete_sequence=[['orange', 'green', 'indigo']], labels={'index': 'Model', 'value': 'RMSE Score'}, title='Model Score Comparison', size=regr_scores.RMSE, log_y=True)


# comparing train and predict times
fig = go.Figure()
fig.add_trace(go.Bar(
    name='Train',
    x=['decision tree', 'random forest', 'linear regression'], y=regr_scores.train_time,
    error_y=dict(type='data', array=[1.5, 10.7, .0691])
))
fig.add_trace(go.Bar(
    name='Predict',
    x=['decision tree', 'random forest', 'linear regression'], y=regr_scores.pred_time,
    error_y=dict(type='data', array=[.00518, .00158, .00301]
)))

fig.update_layout(barmode='group', title='Model Prediction Time Comparison', template='seaborn')
fig.show()


# %%timeit
# XGB

X2 = df.drop(['price','year_crawled', 'month_crawled',
       'day_crawled', 'time_crawled'], axis=1)
y2 = df['price']

lbl = LabelEncoder()
X2.vehicle_type = lbl.fit_transform(X2.vehicle_type.astype(str))
X2.gearbox = lbl.fit_transform(X2.gearbox.astype(str))
X2.model = lbl.fit_transform(X2.model.astype(str))
X2.fuel_type = lbl.fit_transform(X2.fuel_type.astype(str))
X2.brand = lbl.fit_transform(X2.brand.astype(str))
X2.not_repaired = lbl.fit_transform(X2.not_repaired.astype(str))


X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.25, random_state=19)

X2_train, X2_valid, y2_train, y2_valid = train_test_split(
    X2_train, y2_train, test_size=0.20, random_state=19)


# Create a XGBM 
xgbr = xgb.XGBRegressor(random_state=19, eval_metric='rmse')

xgbr.fit(X2_train, y2_train)

# Make predictions on the test set
predictions_xgbr = xgbr.predict(X2_valid)

result = mse(y2_valid, predictions_xgbr) ** 0.5 # calculate RMSE on validation set
print()
print("RMSE of the xgbm model on the validation set:", result)

RMSE of the xgbm model on the validation set: 1743.2028538931468


%%timeit
# xg boost prediction
predictions_xgbr = xgbr.predict(new_features)

4.87 ms ± 198 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# xg boost parameters
xgbr.get_xgb_params

<bound method XGBModel.get_xgb_params of XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric='rmse', feature_types=None, gamma=0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.300000012, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=6, max_leaves=0, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=19, ...)>


# table of feature importance
xgbr_imp = [t for t in zip(features, xgbr.feature_importances_)]
xgbr_imp_df = pd.DataFrame(xgbr_imp, columns=['feature', 'varimp'])
xgbr_imp_df = xgbr_imp_df.sort_values('varimp', ascending=False)


# %%timeit
# LGBM

X = df.drop(['price','year_crawled', 'month_crawled',
       'day_crawled', 'time_crawled'], axis=1)
y = df['price']

obj_feat = list(X.loc[:, X.dtypes == 'object'].columns.values)

for feature in obj_feat:
    X[feature] = pd.Series(X[feature], dtype='category')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=19)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.20, random_state=19)

# Create a LightGBM dataset
lgb_train = lgb.Dataset(X_train, y_train, params={'verbose': -1}, free_raw_data=False)
lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train, params={'verbose': -1}, free_raw_data=False)

# Define the parameters for the LightGBM model
params = {
    'objective': 'regression',
    'metric': 'root_mean_squared_error',
    'boosting_type': 'gbdt',
    'random_state': 19,
    'verbose':-1
}


# Train the LightGBM model
lgbm = lgb.train(params, lgb_train, valid_sets=lgb_valid, num_boost_round=4000, early_stopping_rounds=500, verbose_eval=False)

# Make predictions on the validation set
predictions_valid_lgbm = lgbm.predict(X_valid)

result = mse(y_valid, predictions_valid_lgbm) ** 0.5 # calculate RMSE on validation set
print()
print("RMSE of the lgbm model on the validation set:", result)

c:\Users\XIX\anaconda3\lib\site-packages\lightgbm\engine.py:181: UserWarning:

'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.

c:\Users\XIX\anaconda3\lib\site-packages\lightgbm\engine.py:239: UserWarning:

'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.

c:\Users\XIX\anaconda3\lib\site-packages\lightgbm\basic.py:1780: UserWarning:

Overriding the parameters from Reference Dataset.

c:\Users\XIX\anaconda3\lib\site-packages\lightgbm\basic.py:1513: UserWarning:

categorical_column in param dict is overridden.

RMSE of the lgbm model on the validation set: 1633.1585371342076


# %%timeit
# light gbm prediction
lgbm.predict(X_valid[:2])

array([2824.93089504, 1220.24599181])


# table of feature importance
feat_imp = [t for t in zip(features, lgbm.feature_importance())]
feat_imp_df = pd.DataFrame(feat_imp, columns=['feature', 'varimp'])
feat_imp_df = feat_imp_df.sort_values('varimp', ascending=False)


# fill missing values as string NaN
df.fillna('NaN', inplace=True)


# %%timeit
# catboost
X1 = df.drop(['price','year_crawled', 'month_crawled',
       'day_crawled', 'time_crawled'], axis=1)
y1 = df['price']


X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.25, random_state=19)

X1_train, X1_valid, y1_train, y1_valid = train_test_split(
    X1_train, y1_train, test_size=0.20, random_state=19)

# fill missing values as string NaN

cat_features = [
    'vehicle_type',
    'gearbox',
    'model',
    'fuel_type', 
    'brand', 
    'not_repaired'
]

catb = CatBoostRegressor(task_type='GPU', loss_function='RMSE', eval_metric='RMSE', iterations=1500, random_seed=19)

catb.fit(X1_train, y1_train, eval_set=(X1_valid, y1_valid), cat_features=cat_features, verbose=100, use_best_model=True)

# Make predictions on the test set
predictions_valid_catb = catb.predict(X1_valid)

result = mse(y1_valid, predictions_valid_catb) ** 0.5 # calculate RMSE on validation set
print()
print("Catboost model on the validation set: ")
catb.best_score_

Learning rate set to 0.097278
0:	learn: 4294.0143420	test: 4312.0688882	best: 4312.0688882 (0)	total: 108ms	remaining: 2m 42s
100:	learn: 1850.7068059	test: 1865.4927636	best: 1865.4927636 (100)	total: 11.4s	remaining: 2m 38s
200:	learn: 1772.7411209	test: 1799.9096708	best: 1799.9096708 (200)	total: 22.5s	remaining: 2m 25s
300:	learn: 1736.0036154	test: 1775.2809012	best: 1775.2809012 (300)	total: 33.4s	remaining: 2m 13s
400:	learn: 1711.0098842	test: 1759.1834077	best: 1759.1834077 (400)	total: 44.8s	remaining: 2m 2s
500:	learn: 1691.8484583	test: 1749.0225927	best: 1749.0161440 (499)	total: 56.6s	remaining: 1m 52s
600:	learn: 1674.9005953	test: 1740.0733097	best: 1740.0733097 (600)	total: 1m 8s	remaining: 1m 42s
700:	learn: 1661.0412251	test: 1734.2242451	best: 1734.2242451 (700)	total: 1m 19s	remaining: 1m 30s
800:	learn: 1646.8603198	test: 1727.0059576	best: 1726.9695270 (798)	total: 1m 31s	remaining: 1m 19s
900:	learn: 1634.0056510	test: 1722.4670428	best: 1722.4670428 (900)	total: 1m 42s	remaining: 1m 8s
1000:	learn: 1623.3198191	test: 1718.4793385	best: 1718.4793385 (1000)	total: 1m 54s	remaining: 57.3s
1100:	learn: 1614.7202606	test: 1715.9938970	best: 1715.9706353 (1099)	total: 2m 6s	remaining: 45.9s
1200:	learn: 1603.9303403	test: 1711.9758283	best: 1711.9758283 (1200)	total: 2m 18s	remaining: 34.5s
1300:	learn: 1594.9271146	test: 1708.6188187	best: 1708.6188187 (1300)	total: 2m 30s	remaining: 22.9s
1400:	learn: 1587.0520286	test: 1705.6079899	best: 1705.6079899 (1400)	total: 2m 41s	remaining: 11.4s
1499:	learn: 1579.7353030	test: 1703.4617586	best: 1703.4617586 (1499)	total: 2m 53s	remaining: 0us
bestTest = 1703.461759
bestIteration = 1499

Catboost model on the validation set:

{'learn': {'RMSE': 1579.7353029785438},
 'validation': {'RMSE': 1703.4617585957194}}


# %%timeit
# catboost predictions
catb.predict(new_features)

array([-1357.82073855,  4612.8681004 ])


# table of feature importance
feat_import = [t for t in zip(features, catb.get_feature_importance())]
feat_import_df = pd.DataFrame(feat_import, columns=['feature', 'varimp'])
feat_import_df = feat_import_df.sort_values('varimp', ascending=False)


# boosting model metrics
data2 = {'xgb': (1754.20, 13.9, 1.98, .0046, 2.67 * 10 ** -4), 'lgb': (1629.89, 66 , 3.75, 0.0113, .012), 'catboost': (1703.46, 277, 11.7, 0.00226, 2.26 * 10 ** -4)}
boost_scores = pd.DataFrame(data=data2, columns=data2.keys(), index=['RMSE', 'train_time', 'train_std', 'pred_time', 'pred_std'])
boost_scores = boost_scores.T


# boosting RMSE
px.scatter(boost_scores.RMSE, color_discrete_sequence=[['yellow', 'cyan', 'violet']], labels={'index': 'Model', 'value': 'RMSE Score'}, title='Model Score Comparison', size=boost_scores.RMSE, log_y=True)


# comparing train and predict times
fig = go.Figure()
fig.add_trace(go.Bar(
    name='Train',
    x=['xg boost', 'light gbm', 'catboost'], y=boost_scores.train_time,
    error_y=dict(type='data', array=[1.98, 3.75, 11.7])
))
fig.add_trace(go.Bar(
    name='Predict',
    x=['xg boost', 'light gbm', 'catboost'], y=boost_scores.pred_time,
    error_y=dict(type='data', array=[.000267, .012, .000226]
)))

fig.update_layout(barmode='group', title='Model Prediction Time Comparison', template='seaborn')
fig.show()


# combining first two feature importance tables
feat1 = xgbr_imp_df.merge(feat_imp_df, on='feature')
final_feat = feat1.merge(feat_import_df, on='feature')


# combining final feature table
final_feat.columns = ['features', 'xgb', 'lgbm', 'catboost']


# normalize importance to same scale
final_feat.lgbm = (final_feat.lgbm - final_feat.lgbm.mean())/ final_feat.lgbm.std()
final_feat.catboost = final_feat.catboost/100


# feature importance of boosting models
px.bar(final_feat, x='features', y=['xgb', 'lgbm', 'catboost'], barmode='group', template='plotly_dark', title='Feature Importance of Boosting Models', labels={'value': 'Importance'})


# regression scores mean train time
regr_scores.train_time.mean()

29.197599999999998


# regression scores mean RMSE
regr_scores.RMSE.mean()

2526.7566666666667


# boost scores mean train time
boost_scores.train_time.mean()

118.96666666666665


# boost scores mean RMSE
boost_scores.RMSE.mean()

1695.8500000000001


# model training time comparisons
fig = go.Figure()
fig.add_trace(go.Bar(
    name='regression',
    x=['decision tree', 'random forest', 'linear regression'], y=regr_scores.train_time,
    error_y=dict(type='data', array=[1.5, 10.7, .0691])
))
fig.add_trace(go.Bar(
    name='boost',
    x=['xg boost', 'light gbm', 'catboost'], y=boost_scores.train_time,
    error_y=dict(type='data', array=[1.98, 3.75, 11.7])
))
fig.update_layout(barmode='group', title='Model Training Time Comparison', template='seaborn')
fig.show()


# comparing prediction times
fig = go.Figure()

fig.add_trace(go.Bar(
    name='regression',
    x=['decision tree', 'random forest', 'linear regression'], y=regr_scores.pred_time,
    error_y=dict(type='data', array=[.00518, .00158, .00301]
)))
fig.add_trace(go.Bar(
    name='boost',
    x=['xg boost', 'light gbm', 'catboost'], y=boost_scores.pred_time,
    error_y=dict(type='data', array=[.000267, .012, .000226]
)))

fig.update_layout(barmode='group', title='Model Prediction Time Comparison', template='seaborn')
fig.show()


# comparing RMSE Scores
fig = go.Figure()

fig.add_trace(go.Scatter(
    y=regr_scores.RMSE,
    x=['decision tree', 'random forest', 'linear regression'] )
)
fig.add_trace(go.Scatter(
    y=boost_scores.RMSE, 
    x=['xg boost', 'light gbm', 'catboost'] )
)

fig.update_layout(title='Model RMSE Comparison', template='plotly_dark')
fig.show()


# Make predictions on the validation set
predictions_test_lgbm = lgbm.predict(X_test)

result = mse(y_test, predictions_test_lgbm) ** 0.5 # calculate RMSE on test set
print()
print("RMSE of the lgbm model on the test set:", result)

RMSE of the lgbm model on the test set: 1637.7809549995363

	price	registration_year	power	mileage	registration_month	number_of_pictures	postal_code	year_crawled	month_crawled	day_crawled	time_crawled
price	1.000000	0.026916	0.158872	-0.333199	0.110581	NaN	0.076055	NaN	-0.007529	-0.003374	0.017560
registration_year	0.026916	1.000000	-0.000828	-0.053447	-0.011619	NaN	-0.003459	NaN	-0.001598	0.001513	-0.001617
power	0.158872	-0.000828	1.000000	0.024002	0.043380	NaN	0.021665	NaN	0.000750	-0.001815	-0.000617
mileage	-0.333199	-0.053447	0.024002	1.000000	0.009571	NaN	-0.007698	NaN	0.000245	0.001013	-0.011518
registration_month	0.110581	-0.011619	0.043380	0.009571	1.000000	NaN	0.013995	NaN	0.003955	-0.002674	0.012123
number_of_pictures	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
postal_code	0.076055	-0.003459	0.021665	-0.007698	0.013995	NaN	1.000000	NaN	-0.010531	0.004484	0.016590
year_crawled	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
month_crawled	-0.007529	-0.001598	0.000750	0.000245	0.003955	NaN	-0.010531	NaN	1.000000	-0.572049	-0.000910
day_crawled	-0.003374	0.001513	-0.001815	0.001013	-0.002674	NaN	0.004484	NaN	-0.572049	1.000000	0.006911
time_crawled	0.017560	-0.001617	-0.000617	-0.011518	0.012123	NaN	0.016590	NaN	-0.000910	0.006911	1.000000

	price	registration_year	power	mileage	registration_month	number_of_pictures	postal_code	year_crawled	month_crawled	day_crawled	time_crawled
count	354369.000000	354369.000000	354369.000000	354369.000000	354369.000000	354369.0	354369.000000	354369.0	354369.000000	354369.000000	354369.000000
mean	4416.656776	2004.234448	110.094337	128211.172535	5.714645	0.0	50508.689087	2016.0	4.406150	14.609021	14.909665
std	4514.158514	90.227958	189.850405	37905.341530	3.726421	0.0	25783.096248	0.0	2.804298	10.190826	5.422938
min	0.000000	1000.000000	0.000000	5000.000000	0.000000	0.0	1067.000000	2016.0	1.000000	3.000000	0.000000
25%	1050.000000	1999.000000	69.000000	125000.000000	3.000000	0.0	30165.000000	2016.0	3.000000	4.000000	12.000000
50%	2700.000000	2003.000000	105.000000	150000.000000	6.000000	0.0	49413.000000	2016.0	3.000000	16.000000	16.000000
75%	6400.000000	2008.000000	143.000000	150000.000000	9.000000	0.0	71083.000000	2016.0	5.000000	24.000000	19.000000
max	20000.000000	9999.000000	20000.000000	150000.000000	12.000000	0.0	99998.000000	2016.0	12.000000	31.000000	23.000000

Rusty Bargain Used Car Sales ¶

Import Data ¶

Conclusions ¶

Data Analysis ¶

Conclusions ¶

Model Preparation ¶

Conclusions ¶

Modeling ¶

Regressions ¶

Decision Tree ¶

Random Forest ¶

Linear Regression ¶

Comparing Regressions ¶

Conclusions ¶

Boosting ¶

XG Boost ¶

Light GBM ¶

Catboost ¶

Comparing Boosting Models ¶

Score ¶

Conclusions ¶

Feature Importance ¶

Conclusion ¶

Comparison between Regression and Boosting ¶

Final Model ¶

Light GBM ¶

Overall Conclusions ¶

	DateCrawled	Price	VehicleType	RegistrationYear	Gearbox	Power	Model	Mileage	RegistrationMonth	FuelType	Brand	NotRepaired	DateCreated	PostalCode	LastSeen
0	24/03/2016 11:52	480	NaN	1993	manual	0	golf	150000	0	petrol	volkswagen	NaN	24/03/2016 00:00	70435	07/04/2016 03:16
1	24/03/2016 10:58	18300	coupe	2011	manual	190	NaN	125000	5	gasoline	audi	yes	24/03/2016 00:00	66954	07/04/2016 01:46
2	14/03/2016 12:52	9800	suv	2004	auto	163	grand	125000	8	gasoline	jeep	NaN	14/03/2016 00:00	90480	05/04/2016 12:47
3	17/03/2016 16:54	1500	small	2001	manual	75	golf	150000	6	petrol	volkswagen	no	17/03/2016 00:00	91074	17/03/2016 17:40
4	31/03/2016 17:25	3600	small	2008	manual	69	fabia	90000	7	gasoline	skoda	no	31/03/2016 00:00	60437	06/04/2016 10:17

	price	vehicle_type	registration_year	gearbox	power	model	mileage	registration_month	fuel_type	brand	not_repaired	postal_code	year_crawled	month_crawled	day_crawled	time_crawled
15	450	small	1910	NaN	0	ka	5000	0	petrol	ford	NaN	24148	2016	11	3	21
1928	7000	suv	1945	manual	48	other	150000	2	petrol	volkswagen	no	58135	2016	3	25	15
2273	1800	convertible	1925	NaN	0	NaN	5000	1	NaN	sonstige_autos	no	79288	2016	3	15	21
3333	10500	sedan	1955	manual	30	other	60000	0	petrol	ford	NaN	53498	2016	3	15	21
6629	0	small	1910	NaN	0	NaN	5000	1	other	sonstige_autos	NaN	93105	2016	2	4	13

	vehicle_type	registration_year	gearbox	power	model	mileage	registration_month	fuel_type	brand	not_repaired	postal_code	year_crawled	month_crawled	day_crawled	time_crawled
7	sedan	1980	manual	50	other	40000	7	petrol	volkswagen	no	19348	2016	3	21	18
152	bus	2004	manual	101	meriva	150000	10	lpg	opel	yes	27432	2016	11	3	18
231	wagon	2001	manual	115	mondeo	150000	0	NaN	ford	no	57627	2016	3	16	22
466	NaN	2016	auto	197	3er	150000	12	petrol	bmw	no	99867	2016	3	24	10
563	sedan	1998	auto	230	NaN	150000	10	petrol	saab	no	71691	2016	3	21	12

	vehicle_type	gearbox	model	fuel_type	brand	not_repaired
0	3.0	2.0	26.0	6.0	1.0	1.0
1	7.0	1.0	118.0	6.0	14.0	0.0
2	6.0	2.0	117.0	6.0	38.0	0.0
3	6.0	2.0	102.0	6.0	31.0	0.0
4	5.0	2.0	11.0	6.0	2.0	1.0

	price	vehicle_type	registration_year	gearbox	power	model	mileage	registration_month	fuel_type	brand	not_repaired	postal_code	year_crawled	month_crawled	day_crawled	time_crawled
0	18300.0	3.0	2011.0	2.0	190.0	26.0	125000.0	5.0	6.0	1.0	1.0	66954.0	2016.0	3.0	24.0	10.0
1	9800.0	7.0	2004.0	1.0	163.0	118.0	125000.0	8.0	6.0	14.0	0.0	90480.0	2016.0	3.0	14.0	12.0
2	1500.0	6.0	2001.0	2.0	75.0	117.0	150000.0	6.0	6.0	38.0	0.0	91074.0	2016.0	3.0	17.0	16.0
3	3600.0	6.0	2008.0	2.0	69.0	102.0	90000.0	7.0	6.0	31.0	0.0	60437.0	2016.0	3.0	31.0	17.0
4	650.0	5.0	1995.0	2.0	102.0	11.0	150000.0	10.0	6.0	2.0	1.0	33775.0	2016.0	4.0	4.0	17.0

	price	vehicle_type	registration_year	gearbox	power	model	mileage	registration_month	fuel_type	brand	postal_code	year_crawled	month_crawled	day_crawled	time_crawled
31	3500.0	4.0	2004.0	2.0	122.0	130.0	150000.0	11.0	0.0	24.0	67071.0	2016.0	3.0	17.0	18.0
32	350.0	4.0	2016.0	2.0	75.0	167.0	150000.0	4.0	6.0	38.0	19386.0	2016.0	8.0	3.0	7.0
61	1280.0	4.0	1992.0	2.0	109.0	237.0	150000.0	9.0	6.0	20.0	86163.0	2016.0	3.0	15.0	9.0
315	16500.0	4.0	1983.0	2.0	218.0	26.0	150000.0	10.0	6.0	0.0	42719.0	2016.0	3.0	30.0	13.0
331	2950.0	4.0	2005.0	2.0	131.0	167.0	150000.0	6.0	6.0	5.0	91781.0	2016.0	12.0	3.0	12.0