# Import necessary libraries
import pandas as pd 
import plotly_express as px
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error as mse, f1_score, accuracy_score, recall_score, precision_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn import metrics
from sklearn.preprocessing import StandardScaler 
from sklearn.utils import shuffle
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder


# Reading the data
df = pd.read_csv('datasets/Churn.csv')


# First look at the data
df.head()


# overall description of the data
df.describe()


# looking for correlations 
df.corr()


# Changing column names to lowercase with underscores
df.columns = df.columns.str.lower()
df.rename(columns={'rownumber': 'row_number', 'customerid': 'customer_id', 'creditscore': 'credit_score','numofproducts': 'num_of_products', 'hascrcard': 'has_cr_card', 'isactivemember': 'is_active_member', 'estimatedsalary': 'estimated_salary' }, inplace=True)


# looking for missing values
df.isna().sum()

row_number            0
customer_id           0
surname               0
credit_score          0
geography             0
gender                0
age                   0
tenure              909
balance               0
num_of_products       0
has_cr_card           0
is_active_member      0
estimated_salary      0
exited                0
dtype: int64


# Counting the different values of geography 
df.geography.value_counts()

France     5014
Germany    2509
Spain      2477
Name: geography, dtype: int64


# Counting the different vales of tenure
df.tenure.value_counts()

1.0     952
2.0     950
8.0     933
3.0     928
5.0     927
7.0     925
4.0     885
9.0     882
6.0     881
10.0    446
0.0     382
Name: tenure, dtype: int64


# mean and median tenure
print(df.tenure.mean())
print(df.tenure.median())

4.997690023099769
5.0


# filling missing tenure values with median
df.tenure.fillna(df.tenure.median(), inplace=True)


# looking for duplicates, hidden duplicates
print(df.duplicated().sum())
print(df.customer_id.duplicated().sum())

0
0


# Looking for hidden duplicates
df.customer_id.nunique()

10000


# Ensuring two values for gender
df.gender.nunique()

2


# ensuring these features have two values
print(df.has_cr_card.nunique())
print(df.is_active_member.nunique())
print(df.exited.nunique())

2
2
2


# Dropping all missing rows
df = df.dropna()


# No missing values
df.isna().sum()

row_number          0
customer_id         0
surname             0
credit_score        0
geography           0
gender              0
age                 0
tenure              0
balance             0
num_of_products     0
has_cr_card         0
is_active_member    0
estimated_salary    0
exited              0
dtype: int64


# Looking at the balance of genders
df.gender.value_counts()

Male      5457
Female    4543
Name: gender, dtype: int64


# target is unbalanced
df.exited.value_counts()

0    7963
1    2037
Name: exited, dtype: int64


# number of geography options
df.geography.nunique()

3


# Changing categorical columns to labels
encoder = OrdinalEncoder()
df_ordinal = pd.DataFrame(encoder.fit_transform(df[['surname', 'geography', 'gender']]), columns=['surname', 'geography', 'gender'])


# merge ordinal columns back into df
df = df.merge(df_ordinal, left_index=True, right_index=True)


# drop the left duplicate columns and rename the right columns
df = df.drop(['surname_x', 'geography_x', 'gender_x'], axis=1)
df.rename(columns={'surname_y': 'surname', 'geography_y': 'geography', 'gender_y': 'gender'}, inplace=True)


# counting the target values
df.exited.value_counts()

0    7963
1    2037
Name: exited, dtype: int64


# Geography: 0 - France, 1 - Germany, 2 - Spain
df.geography.value_counts()

0.0    5014
1.0    2509
2.0    2477
Name: geography, dtype: int64


# Target and Features
target = df['exited']
features = df.drop(['exited', 'row_number', 'customer_id', 'surname'], axis=1)


# Splitting dataset into 3: train, test, valid

features_train, features_test, target_train, target_test = train_test_split( 
    features, target, test_size=0.2, random_state=19) # split 20% of data to make validation set
features_train, features_valid, target_train, target_valid = train_test_split(
    features_train, target_train, test_size=0.25, random_state=19) # 0.25 x 0.8 = 0.2


# Visual of the split data
print(features_train.shape)
print(target_train.shape)
print(features_valid.shape)
print(target_valid.shape)
print(features_test.shape)
print(target_test.shape)

(6000, 10)
(6000,)
(2000, 10)
(2000,)
(2000, 10)
(2000,)


# Identify column names
features_train.columns

Index(['credit_score', 'age', 'tenure', 'balance', 'num_of_products',
       'has_cr_card', 'is_active_member', 'estimated_salary', 'geography',
       'gender'],
      dtype='object')


# Using these numeric columns
numeric = ['credit_score', 'age', 'tenure', 'balance', 'num_of_products','estimated_salary']

scaler = StandardScaler()
scaler.fit(features_train[numeric])

StandardScaler()


# Scaling the features
features_train[numeric] = scaler.transform(features_train[numeric])
features_valid[numeric] = scaler.transform(features_valid[numeric])
features_test[numeric] = scaler.transform(features_test[numeric])

print(features_train.shape)

(6000, 10)


# Visual of the features
features_train


# Target values before upsampling
target.value_counts()

0    7963
1    2037
Name: exited, dtype: int64


# function to upsample the features and target
def upsample(features, target, repeat):
    features_zeros = features_train[target_train == 0]
    features_ones = features_train[target_train == 1]
    target_zeros = target_train[target_train == 0]
    target_ones = target_train[target_train == 1]
    

    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)

    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=19
    )

    return features_upsampled, target_upsampled


features_upsampled, target_upsampled = upsample(
    features_train, target_train, 4)


# Results of the upsampling, 
print(features_upsampled.shape)
print(target_upsampled.shape)

(9666, 10)
(9666,)


# looking at upsampled target counts
target_upsampled.value_counts()

1    4888
0    4778
Name: exited, dtype: int64


# Decision Tree with loop for depth
best_model = None
best_result = 0
best_depth = 0
for depth in range(1,20):
    model_0 = DecisionTreeClassifier(random_state=19, max_depth=depth)
    model_0.fit(features_train, target_train) # using unbalanced data for comparison
    predictions_valid0 = model_0.predict(features_valid)
    result0 = f1_score(target_valid, predictions_valid0) 
    if result0 > best_result:
        best_model = model_0
        best_result = result0
        best_depth = depth  

        
print("Best Depth:", best_depth, ","   "  F1 of the best model on the validation set:", best_result)

# AUC-ROC score 
probabilities_valid0 = model_0.predict_proba(features_valid)
probabilities_one_valid0 = probabilities_valid0[:, 1]

auc_roc = roc_auc_score(target_valid, probabilities_one_valid0)

print('AUC-ROC score:', auc_roc)

Best Depth: 2 ,  F1 of the best model on the validation set: 0.5122302158273381
AUC-ROC score: 0.6598716710278545


# Decision Tree with loop for depth
best_model = None
best_result = 0
best_depth = 0
for depth in range(1,20):
    model = DecisionTreeClassifier(random_state=19, max_depth=depth)
    model.fit(features_upsampled, target_upsampled) # using balanced, upsampled data
    predictions_valid = model.predict(features_valid)
    result = f1_score(target_valid, predictions_valid) 
    if result > best_result:
        best_model = model
        best_result = result
        best_depth = depth  

        
print("Best Depth:", best_depth, ","   "  F1 of the best model on the validation set:", best_result)

# AUC-ROC score 
probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]

auc_roc = roc_auc_score(target_valid, probabilities_one_valid)

print('AUC-ROC score:', auc_roc)

Best Depth: 6 ,  F1 of the best model on the validation set: 0.5752636625119847
AUC-ROC score: 0.6804484788828838


# Logistic Regression, no class weights
model_002 = LogisticRegression(random_state=19, solver='saga', multi_class='multinomial')  
model_002.fit(features_train, target_train)  # using unbalanced data for comparison
predictions002_valid = model_002.predict(features_valid)
f1_valid_002 = f1_score(target_valid, predictions002_valid)  

print(
    "F1 score of the logistic regression model on the validation set:",
    f1_valid_002
)

# AUC-ROC score 
probabilities_valid002 = model_002.predict_proba(features_valid)
probabilities_one_valid002 = probabilities_valid002[:, 1]

auc_roc = roc_auc_score(target_valid, probabilities_one_valid002)

print('AUC-ROC score:', auc_roc)

F1 score of the logistic regression model on the validation set: 0.25621414913957935
AUC-ROC score: 0.7691851507666725


# Logistic Regression, balanced 
model_02 = LogisticRegression(random_state=19, solver='saga', class_weight='balanced', multi_class='multinomial')  
model_02.fit(features_train, target_train)  # using balanced data for comparison
predictions02_valid = model_02.predict(features_valid)
f1_valid_02 = f1_score(target_valid, predictions02_valid)  

print(
    "F1 score of the logistic regression model on the validation set:",
    f1_valid_02
)

# AUC-ROC score 
probabilities_valid02 = model_02.predict_proba(features_valid)
probabilities_one_valid02 = probabilities_valid02[:, 1]

auc_roc = roc_auc_score(target_valid, probabilities_one_valid02)

print('AUC-ROC score:', auc_roc)

F1 score of the logistic regression model on the validation set: 0.49613733905579394
AUC-ROC score: 0.7746936978797291


# Logistic Regression, not balanced, upsampled
model20 = LogisticRegression(random_state=19, solver='saga', multi_class='multinomial')  
model20.fit(features_upsampled, target_upsampled)  #  upsampled data
predictions20_valid = model20.predict(features_valid)
f1_valid20 = f1_score(target_valid, predictions20_valid)  

print(
    "F1 score of the logistic regression model on the validation set:",
    f1_valid20
)

# AUC-ROC score 
probabilities_valid20 = model20.predict_proba(features_valid)
probabilities_one_valid20 = probabilities_valid20[:, 1]

auc_roc = roc_auc_score(target_valid, probabilities_one_valid20)

print('AUC-ROC score:', auc_roc)

F1 score of the logistic regression model on the validation set: 0.4923857868020305
AUC-ROC score: 0.7747747059255092


# Logistic Regression
model2 = LogisticRegression(random_state=19, solver='saga', class_weight='balanced', multi_class='multinomial')  
model2.fit(features_upsampled, target_upsampled)  # using balanced, upsampled data
predictions2_valid = model2.predict(features_valid)
f1_valid2 = f1_score(target_valid, predictions2_valid)  

print(
    "F1 score of the logistic regression model on the validation set:",
    f1_valid2
)

# AUC-ROC score 
probabilities_valid2 = model2.predict_proba(features_valid)
probabilities_one_valid2 = probabilities_valid2[:, 1]

auc_roc = roc_auc_score(target_valid, probabilities_one_valid2)

print('AUC-ROC score:', auc_roc)

F1 score of the logistic regression model on the validation set: 0.4965635738831615
AUC-ROC score: 0.7747074539629748


# Random Forest, unbalanced, not upsampled (2 min run)
best_model = None
best_result = 0
best_est = 0
best_depth = 0
for est in range(500, 800, 50):
    for depth in range (700, 1001, 100):
        model_003 = RandomForestClassifier(random_state=19, n_estimators=est, max_depth=depth)
        model_003.fit(features_train, target_train) # using unbalanced data for comparison
        predictions_valid_003 = model_003.predict(features_valid) 
        result_003 = f1_score(target_valid, predictions_valid_003) 
        if result_003 > best_result:
            best_model = model_003
            best_result = result_003
            best_est = est
            best_depth1 = depth

print("F1 of the best model on the validation set:", best_result, "n_estimators:", best_est, "best_depth:", depth)


# AUC-ROC score 
probabilities_valid003 = model_003.predict_proba(features_valid)
probabilities_one_valid003 = probabilities_valid003[:, 1]

auc_roc = roc_auc_score(target_valid, probabilities_one_valid003)

print('AUC-ROC score:', auc_roc)

F1 of the best model on the validation set: 0.5617283950617283 n_estimators: 500 best_depth: 1000
AUC-ROC score: 0.8555978088087843


# Random Forest, balanced, not upsampled (2 min run)
best_model = None
best_result = 0
best_est = 0
best_depth = 0
for est in range(500, 800, 50):
    for depth in range (700, 1001, 100):
        model_03 = RandomForestClassifier(random_state=19, class_weight='balanced', n_estimators=est, max_depth=depth)
        model_03.fit(features_train, target_train) # using unbalanced data for comparison
        predictions_valid_03 = model_03.predict(features_valid) 
        result_03 = f1_score(target_valid, predictions_valid_03) 
        if result_03 > best_result:
            best_model = model_03
            best_result = result_03
            best_est = est
            best_depth1 = depth

print("F1 of the best model on the validation set:", best_result, "n_estimators:", best_est, "best_depth:", depth)


# AUC-ROC score 
probabilities_valid03 = model_03.predict_proba(features_valid)
probabilities_one_valid03 = probabilities_valid03[:, 1]

auc_roc = roc_auc_score(target_valid, probabilities_one_valid03)

print('AUC-ROC score:', auc_roc)

F1 of the best model on the validation set: 0.5246422893481716 n_estimators: 500 best_depth: 1000
AUC-ROC score: 0.8574204898388397


# Random Forest, not balanced, upsampled (2 min run)
best_model = None
best_result = 0
best_est = 0
best_depth = 0
for est in range(200, 400, 20):
    for depth in range (900, 1001, 50):
        model3 = RandomForestClassifier(random_state=19, n_estimators=est, max_depth=depth)
        model3.fit(features_upsampled, target_upsampled) # using balanced, upsampled data
        predictions_valid3 = model3.predict(features_valid) 
        result3 = f1_score(target_valid, predictions_valid3) 
        if result3 > best_result:
            best_model = model3
            best_result = result3
            best_est = est
            best_depth1 = depth

print("F1 of the best model on the validation set:", best_result, "n_estimators:", best_est, "best_depth:", depth)

# AUC-ROC score 
probabilities_valid3 = model3.predict_proba(features_valid)
probabilities_one_valid3 = probabilities_valid3[:, 1]

auc_roc = roc_auc_score(target_valid, probabilities_one_valid3)

print('AUC-ROC score:', auc_roc)

F1 of the best model on the validation set: 0.5842391304347826 n_estimators: 360 best_depth: 1000
AUC-ROC score: 0.8495825792961776


# Random Forest, balanced, upsampled (2 min run)
best_model = None
best_result = 0
best_est = 0
best_depth = 0
for est in range(200, 400, 20):
    for depth in range (900, 1001, 50):
        model30 = RandomForestClassifier(random_state=19, class_weight='balanced', n_estimators=est, max_depth=depth)
        model30.fit(features_upsampled, target_upsampled) # using balanced, upsampled data
        predictions_valid30 = model30.predict(features_valid) 
        result30 = f1_score(target_valid, predictions_valid30) 
        if result3 > best_result:
            best_model = model30
            best_result = result30
            best_est = est
            best_depth1 = depth

print("F1 of the best model on the validation set:", best_result, "n_estimators:", best_est, "best_depth:", depth)

# AUC-ROC score 
probabilities_valid30 = model30.predict_proba(features_valid)
probabilities_one_valid30 = probabilities_valid30[:, 1]

auc_roc = roc_auc_score(target_valid, probabilities_one_valid30)

print('AUC-ROC score:', auc_roc)

# final_model = RandomForestClassifier(random_state=19, n_estimators=320, max_depth=10000) 
# final_model.fit(features_train, target_train)

F1 of the best model on the validation set: 0.578082191780822 n_estimators: 220 best_depth: 1000
AUC-ROC score: 0.8531966080555623


# Final model F1 score
final_model = RandomForestClassifier(random_state=19, n_estimators=360, max_depth=1000) 
final_model.fit(features_upsampled, target_upsampled)
final_prediction = final_model.predict(features_test)
print(f1_score(final_prediction, target_test))

0.5940054495912805


# AUC-ROC score 
probabilities_test = final_model.predict_proba(features_test)
probabilities_one_test = probabilities_test[:, 1]

auc_roc = roc_auc_score(target_test, probabilities_one_test)

print(auc_roc)

0.8483975071124363


# visual of AUC ROC
fpr, tpr, thresholds = roc_curve(target_test, probabilities_one_test)
roc_data = {'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds}
fig = px.line(roc_data, x='fpr', y='tpr', labels={'x':'False Positive Rate', 'y': 'True Positive Rate'}, 
              title=f'AUC ROC Curve (AUC={auc_roc:.3f})')
fig.show()

	RowNumber	CustomerId	Surname	CreditScore	Geography	Gender	Age	Tenure	Balance	NumOfProducts	HasCrCard	IsActiveMember	EstimatedSalary	Exited
0	1	15634602	Hargrave	619	France	Female	42	2.0	0.00	1	1	1	101348.88	1
1	2	15647311	Hill	608	Spain	Female	41	1.0	83807.86	1	0	1	112542.58	0
2	3	15619304	Onio	502	France	Female	42	8.0	159660.80	3	1	0	113931.57	1
3	4	15701354	Boni	699	France	Female	39	1.0	0.00	2	0	0	93826.63	0
4	5	15737888	Mitchell	850	Spain	Female	43	2.0	125510.82	1	1	1	79084.10	0

	RowNumber	CustomerId	CreditScore	Age	Tenure	Balance	NumOfProducts	HasCrCard	IsActiveMember	EstimatedSalary	Exited
count	10000.00000	1.000000e+04	10000.000000	10000.000000	9091.000000	10000.000000	10000.000000	10000.00000	10000.000000	10000.000000	10000.000000
mean	5000.50000	1.569094e+07	650.528800	38.921800	4.997690	76485.889288	1.530200	0.70550	0.515100	100090.239881	0.203700
std	2886.89568	7.193619e+04	96.653299	10.487806	2.894723	62397.405202	0.581654	0.45584	0.499797	57510.492818	0.402769
min	1.00000	1.556570e+07	350.000000	18.000000	0.000000	0.000000	1.000000	0.00000	0.000000	11.580000	0.000000
25%	2500.75000	1.562853e+07	584.000000	32.000000	2.000000	0.000000	1.000000	0.00000	0.000000	51002.110000	0.000000
50%	5000.50000	1.569074e+07	652.000000	37.000000	5.000000	97198.540000	1.000000	1.00000	1.000000	100193.915000	0.000000
75%	7500.25000	1.575323e+07	718.000000	44.000000	7.000000	127644.240000	2.000000	1.00000	1.000000	149388.247500	0.000000
max	10000.00000	1.581569e+07	850.000000	92.000000	10.000000	250898.090000	4.000000	1.00000	1.000000	199992.480000	1.000000

	RowNumber	CustomerId	CreditScore	Age	Tenure	Balance	NumOfProducts	HasCrCard	IsActiveMember	EstimatedSalary	Exited
RowNumber	1.000000	0.004202	0.005840	0.000783	-0.007322	-0.009067	0.007246	0.000599	0.012044	-0.005988	-0.016571
CustomerId	0.004202	1.000000	0.005308	0.009497	-0.021418	-0.012419	0.016972	-0.014025	0.001665	0.015271	-0.006248
CreditScore	0.005840	0.005308	1.000000	-0.003965	-0.000062	0.006268	0.012238	-0.005458	0.025651	-0.001384	-0.027094
Age	0.000783	0.009497	-0.003965	1.000000	-0.013134	0.028308	-0.030680	-0.011721	0.085472	-0.007201	0.285323
Tenure	-0.007322	-0.021418	-0.000062	-0.013134	1.000000	-0.007911	0.011979	0.027232	-0.032178	0.010520	-0.016761
Balance	-0.009067	-0.012419	0.006268	0.028308	-0.007911	1.000000	-0.304180	-0.014858	-0.010084	0.012797	0.118533
NumOfProducts	0.007246	0.016972	0.012238	-0.030680	0.011979	-0.304180	1.000000	0.003183	0.009612	0.014204	-0.047820
HasCrCard	0.000599	-0.014025	-0.005458	-0.011721	0.027232	-0.014858	0.003183	1.000000	-0.011866	-0.009933	-0.007138
IsActiveMember	0.012044	0.001665	0.025651	0.085472	-0.032178	-0.010084	0.009612	-0.011866	1.000000	-0.011421	-0.156128
EstimatedSalary	-0.005988	0.015271	-0.001384	-0.007201	0.010520	0.012797	0.014204	-0.009933	-0.011421	1.000000	0.012097
Exited	-0.016571	-0.006248	-0.027094	0.285323	-0.016761	0.118533	-0.047820	-0.007138	-0.156128	0.012097	1.000000

	credit_score	age	tenure	balance	num_of_products	has_cr_card	is_active_member	estimated_salary	geography	gender
7143	0.703732	1.432882	0.713391	-1.217433	0.790787	1	1	0.442364	0.0	0.0
1730	2.046232	-1.811595	0.713391	-1.217433	0.790787	1	0	-1.183483	0.0	1.0
5253	0.047853	-1.143614	1.436178	-1.217433	0.790787	0	0	0.146972	2.0	0.0
1601	1.882263	0.955753	1.074784	1.346823	0.790787	1	0	0.515762	0.0	0.0
1034	0.129838	0.669475	-0.732183	-1.217433	0.790787	0	1	1.330867	0.0	1.0
...	...	...	...	...	...	...	...	...	...	...
1702	0.365544	-1.620743	-0.009396	-1.217433	0.790787	1	0	1.030824	2.0	1.0
4201	0.181079	0.669475	0.713391	1.366045	-0.920256	1	1	1.274958	1.0	0.0
4976	1.820774	-1.334466	0.713391	1.087362	-0.920256	1	0	0.296202	1.0	0.0
9771	-1.745563	0.001495	-0.009396	0.724523	-0.920256	1	1	-0.296458	2.0	1.0
8307	-0.331326	-0.857337	-0.732183	0.919591	-0.920256	1	1	-1.181434	0.0	1.0

Beta Bank Churn Model ¶

Purpose ¶

Hypothesis ¶

Read Data ¶

One Hot Encoding ¶

Splitting Data ¶

Feature Scaling ¶

Upsampling ¶

Modeling ¶

Decision Tree ¶

Logistic Regression ¶

Random Forest ¶

Final Model ¶

AUC ROC ¶

Overall Conclusions ¶