The purpose of this project is to aid Megaline, a mobile carrier, in developing a model that will analyze subscribers' behavior. Once the behavior is analyzed, subscribers on a legacy plan can be recommended one of Megaline's newer plans: Smart or Ultra. We are provided with behavior data from subscribers who have already switched to the new plans. A successful model will recommend the correct new plan to a legacy plan customer. We will be working with data we have used previously, so the data is already clean.
We understand that this is a classification problem, as we are determining which plan to recommend: Smart or Ultra. Since accuracy is our metric of interest, we want to maximize the total number of correct recommendations. Consequently, we will likely go with a random forest model, as prediction speed is not a crucial factor. We set the minimum accuracy threshold at 0.75, meaning we must correctly recommend a plan for at least 75% of customers. Since we do not have a separate test dataset, we will split our source data to create validation and test datasets, each with the conventional 20% of the data.
# !pip install --user -U plotly_express
# import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import classification_report
from sklearn import metrics
import plotly_express as px
import plotly.graph_objects as go
# Read dataframe
df = pd.read_csv('datasets/users_behavior.csv')
# Look at dataframe
df
 | calls | minutes | messages | mb_used | is_ultra |
---|---|---|---|---|---|
0 | 40.0 | 311.90 | 83.0 | 19915.42 | 0 |
1 | 85.0 | 516.75 | 56.0 | 22696.96 | 0 |
2 | 77.0 | 467.66 | 86.0 | 21060.45 | 0 |
3 | 106.0 | 745.53 | 81.0 | 8437.39 | 1 |
4 | 66.0 | 418.74 | 1.0 | 14502.75 | 0 |
... | ... | ... | ... | ... | ... |
3209 | 122.0 | 910.98 | 20.0 | 35124.90 | 1 |
3210 | 25.0 | 190.36 | 0.0 | 3275.61 | 0 |
3211 | 97.0 | 634.44 | 70.0 | 13974.06 | 0 |
3212 | 64.0 | 462.32 | 90.0 | 31239.78 | 0 |
3213 | 80.0 | 566.09 | 6.0 | 29480.52 | 1 |
3214 rows × 5 columns
# Change column types to integer
df.calls = df.calls.astype('int')
df.messages = df.messages.astype('int')
# Confirm data type change, look at data summary
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   calls     3214 non-null   int32
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   int32
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64
dtypes: float64(2), int32(2), int64(1)
memory usage: 100.6 KB
# Ensure no missing values
df.isna().sum()
calls       0
minutes     0
messages    0
mb_used     0
is_ultra    0
dtype: int64
# Ensure no duplicates
df.duplicated().sum()
0
# Values counts of each plan, 1 is Ultra
df.is_ultra.value_counts()
0    2229
1     985
Name: is_ultra, dtype: int64
# Splitting dataset into 3
features = df.drop(['is_ultra'], axis=1)
target = df['is_ultra']
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.2, random_state=19) # split off 20% of the data for the test set
features_train, features_valid, target_train, target_valid = train_test_split(
features_train, target_train, test_size=0.25, random_state=19) # 0.25 x 0.8 = 0.2
# Visual of the split data
print(features_train.shape)
print(target_train.shape)
print(features_valid.shape)
print(target_valid.shape)
print(features_test.shape)
print(target_test.shape)
(1928, 4)
(1928,)
(643, 4)
(643,)
(643, 4)
(643,)
# Decision Tree with loop for depth
best_model = None
best_result = 0
best_depth = 0
for depth in range(1, 101):
    model = DecisionTreeClassifier(random_state=19, max_depth=depth)
    model.fit(features_train, target_train)
    predictions_valid = model.predict(features_valid)
    result = accuracy_score(target_valid, predictions_valid) ** 0.5 # square root of the validation accuracy (monotonic, so the ranking of depths is unaffected, but the printed value is higher than the raw accuracy)
    if result > best_result:
        best_model = model
        best_result = result
        best_depth = depth
print("Best Depth:", best_depth, "," " Accuracy of the best model on the validation set:", best_result)
Best Depth: 8 , Accuracy of the best model on the validation set: 0.8870942657868514
# Accuracy of decision tree
# Note: 'model' here is the last tree from the loop (max_depth=100), not best_model,
# and 'features'/'target' are the full dataset rather than the training split
train_predictions = model.predict(features)
valid_predictions = model.predict(features_valid)
print('Accuracy')
print('Training set:', accuracy_score(target, train_predictions))
print('Validation set:', accuracy_score(target_valid, valid_predictions))
Accuracy
Training set: 0.8864343497199751
Validation set: 0.71850699844479
The decision tree was accurate on the training data, yet noticeably less accurate when making predictions on the validation set, which suggests overfitting. Consequently, we will try another model to see if we can achieve higher accuracy than our decision tree.
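To see where the tree starts to overfit, we can also track training and validation accuracy side by side as the depth grows. Below is a minimal sketch reusing the split variables defined above; the depth range is illustrative.
# Sketch: compare training vs. validation accuracy across tree depths
# to see where the gap (overfitting) opens up
for depth in range(1, 21):
    tree = DecisionTreeClassifier(random_state=19, max_depth=depth)
    tree.fit(features_train, target_train)
    train_acc = accuracy_score(target_train, tree.predict(features_train))
    valid_acc = accuracy_score(target_valid, tree.predict(features_valid))
    print(f"depth={depth:2d}  train={train_acc:.3f}  valid={valid_acc:.3f}")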
# Logistic Regression
model3 = LogisticRegression(random_state=19, solver='liblinear')
model3.fit(features_train, target_train)
score_train = model3.score(features_train, target_train)
score_valid = model3.score(features_valid, target_valid)
# Accuracy (as before, the 'training' score here is computed on the full dataset)
train_predictions3 = model3.predict(features)
valid_predictions3 = model3.predict(features_valid)
print('Accuracy')
print('Training set:', accuracy_score(target, train_predictions3))
print('Validation set:', accuracy_score(target_valid, valid_predictions3))
Accuracy
Training set: 0.7417548226509023
Validation set: 0.7107309486780715
This logistic regression model is less accurate than the decision tree on both the training and validation sets, although the gap between its training and validation accuracy is much smaller. We will try one more model to see if we can get better scores.
# Random Forest
best_model = None
best_result = 0
best_est = 0
best_depth1 = 0
for est in range(10, 101, 10):
    for depth in range(1, 101):
        model1 = RandomForestClassifier(random_state=19, n_estimators=est, max_depth=depth)
        model1.fit(features_train, target_train) # train model on training set
        predictions_valid1 = model1.predict(features_valid) # get model predictions on validation set
        result1 = accuracy_score(target_valid, predictions_valid1) ** 0.5 # square root of the validation accuracy (monotonic, so the best hyperparameters are unaffected, but the printed value is higher than the raw accuracy)
        if result1 > best_result:
            best_model = model1
            best_result = result1
            best_est = est
            best_depth1 = depth
print("Accuracy of the best model on the validation set:", best_result, "n_estimators:", best_est, "best_depth:", best_depth1)
final_model = RandomForestClassifier(random_state=19, n_estimators=best_est, max_depth=best_depth1) # retrain the best hyperparameter combination on the training set
final_model.fit(features_train, target_train)
Accuracy of the best model on the validation set: 0.8984174785618215 n_estimators: 40 best_depth: 13
RandomForestClassifier(max_depth=13, n_estimators=40, random_state=19)
# Model Parameters
final_model.get_params()
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 13, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 40, 'n_jobs': None, 'oob_score': False, 'random_state': 19, 'verbose': 0, 'warm_start': False}
# Accuracy (again, the 'training' score is computed on the full dataset)
train_predictions1 = final_model.predict(features)
valid_predictions1 = final_model.predict(features_valid)
print('Accuracy')
print('Training set:', accuracy_score(target, train_predictions1))
print('Validation set:', accuracy_score(target_valid, valid_predictions1))
Accuracy
Training set: 0.8820784069695085
Validation set: 0.807153965785381
This model achieves the highest validation accuracy of the three while remaining strong on the training data. We will move forward with this model and calculate more metrics.
# Overall Accuracy
test_predictions1 = final_model.predict(features_test)
print('Accuracy')
print('Test set:', accuracy_score(target_test, test_predictions1))
Accuracy
Test set: 0.80248833592535
The accuracy of the model on the test set is 80%, which is fair. This is a general indication of the model's overall performance. Since it meets our accuracy threshold of 75%, we consider the model appropriate for use.
# Null Accuracy
max(target_test.mean(), 1 - target_test.mean())
0.687402799377916
The null accuracy is 68.7%. This is the accuracy obtained by always predicting the majority class (not Ultra). Since our test set accuracy is well above the null accuracy, we conclude our model is better than always assuming not Ultra. The null accuracy is a useful baseline to compare against our classifier's accuracy.
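As a cross-check, scikit-learn's DummyClassifier reproduces this majority-class baseline; a small sketch:
# Sketch: majority-class baseline for comparison with the model's test accuracy
from sklearn.dummy import DummyClassifier

baseline = DummyClassifier(strategy='most_frequent')
baseline.fit(features_train, target_train)
print('Baseline accuracy on the test set:', baseline.score(features_test, target_test))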
# Confusion Matrix
conf_matrix = metrics.confusion_matrix(target_test, test_predictions1)
conf_matrix
array([[399,  43],
       [ 84, 117]], dtype=int64)
# Confusion Matrix Figure
fig = px.imshow(conf_matrix, text_auto=True, labels=dict(y="Actual", x="Predicted"),
x=['Not Ultra', 'Is Ultra'],
y=['Not Ultra', 'Is Ultra'], title='Confusion Matrix')
fig.show()
fig = go.Figure(data=go.Heatmap(z=[[399, 43], [84, 117]],
                                text=[['True Negatives', 'False Positives'], ['False Negatives', 'True Positives']],
                                texttemplate="%{text}", textfont={"size":20},
                                x=['Not Ultra', 'Is Ultra'],
                                y=['Not Ultra', 'Is Ultra']))
fig.update_layout(xaxis_title='Predicted', yaxis_title='Actual')
fig.show()
The confusion matrix breaks down the test predictions by outcome and summarizes the performance of a classification algorithm. It shows us key counts such as true negatives and true positives. True positives are observations that were predicted to be Ultra and were actually Ultra. True negatives are observations that were predicted to not be Ultra and were indeed not Ultra. We see this model is better at predicting true negatives than true positives, which demonstrates that the model can more easily determine when a plan should not be Ultra than when it should be Ultra. We also see that false positives are relatively rare, while false negatives are more common, consistent with the weaker performance on the Ultra class.
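For reference, scikit-learn lays the binary confusion matrix out as [[TN, FP], [FN, TP]], so the four counts can be unpacked directly; a small sketch:
# Sketch: unpack the confusion matrix (sklearn order: [[TN, FP], [FN, TP]])
tn, fp, fn, tp = conf_matrix.ravel()
print('True Negatives:', tn)   # predicted Not Ultra, actually Not Ultra
print('False Positives:', fp)  # predicted Ultra, actually Not Ultra
print('False Negatives:', fn)  # predicted Not Ultra, actually Ultra
print('True Positives:', tp)   # predicted Ultra, actually Ultra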
# Sensitivity
# TP / TP + FN
print('The sensitivity is:', metrics.recall_score(target_test, test_predictions1) * 100, '%')
The sensitivity is: 58.2089552238806 %
The sensitivity evaluates how well the classifier detects positive instances, i.e. when the plan is actually Ultra. The value is 58.2%, which is fair.
# Specificity
# TN / (TN + FP)
print('The specificity is:', 399 / float(399 + 43)* 100, '%')
The specificity is: 90.27149321266968 %
The specificity evaluates how well the classifier detects when the plan is not Ultra, when the plan is indeed not Ultra. The value is high, at 90.27%.
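Rather than hardcoding the counts, specificity can also be computed as the recall of the negative (not Ultra) class; a small sketch (the false positive rate in the next cell is then simply 1 - specificity):
# Sketch: specificity without hardcoded counts
# Specificity is the recall of the negative (Not Ultra) class
specificity = metrics.recall_score(target_test, test_predictions1, pos_label=0)
print('Specificity:', specificity * 100, '%')
print('False positive rate:', (1 - specificity) * 100, '%')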
# False Positive Rate
# FP / (TN + FP)
print('The false positives are:', 43 / float(399 + 43)* 100, '%')
The false positives are: 9.728506787330318 %
The false positive rate measures how often the model predicts Ultra when an observation is actually not Ultra. This measure is relatively low, at 9.7%.
# Precision
# TP / (TP + FP)
print('The precision is:', metrics.precision_score(target_test, test_predictions1)*100, '%')
The precision is: 73.125 %
Precision evaluates how often an Ultra prediction correctly identifies an Ultra observation. This measure is 73.1%, which is fair.
# Classification report
print(classification_report(target_test, test_predictions1))
              precision    recall  f1-score   support

           0       0.83      0.90      0.86       442
           1       0.73      0.58      0.65       201

    accuracy                           0.80       643
   macro avg       0.78      0.74      0.76       643
weighted avg       0.80      0.80      0.80       643
Overview of some prediction metrics. Recall measures the ability of the classifier to correctly find all Ultra instances. The f1-score is the harmonic mean of precision and recall. Support is the number of actual occurrences of each class in the test set. We see significantly more plans that are not Ultra; this class imbalance helps explain the lower recall and f1-score for the Ultra class.
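As a quick sanity check on the report, the f1-score for the Ultra class can be recomputed as the harmonic mean of precision and recall; a small sketch:
# Sketch: the f1-score is the harmonic mean of precision and recall
precision = metrics.precision_score(target_test, test_predictions1)
recall = metrics.recall_score(target_test, test_predictions1)
print('f1 (manual): ', 2 * precision * recall / (precision + recall))
print('f1 (sklearn):', metrics.f1_score(target_test, test_predictions1))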
# Separating prediction probabilities for both plans from the final model
is_ultra = (final_model.predict_proba(features_test)*100)[:, 1]
not_ultra = (final_model.predict_proba(features_test)*100)[:, 0]
# Displaying histogram for prediction probabilities of both plans
fig = go.Figure()
fig.add_trace(go.Histogram(x=not_ultra, name='Not Ultra'))
fig.add_trace(go.Histogram(x=is_ultra, name='Is Ultra'))
# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.update_layout(
title_text='Distribution of Ultra vs Not Ultra',
xaxis_title_text='Prediction Percent Confidence',
yaxis_title_text='Count',
)
fig.show()
The figure shows the inner workings of the model when predicting whether a plan should be Ultra. For most observations, the model assigns only a 3-7 percent probability to the Ultra class, and very few predictions are made with more than 50% confidence in Ultra. Correspondingly, the model is usually 93-97 percent certain when predicting that a plan is not Ultra. This happens because the model assigns a likelihood to each class, Ultra and not Ultra, and the class with the higher likelihood becomes the prediction, with a threshold of 0.5. Therefore, if the model assigns Ultra a probability of 0.51, it will predict Ultra, since the probability of not Ultra is 0.49. Consequently, prediction probabilities in the middle range, roughly 0.4 to 0.6, are the least reliable at identifying the correct classification.
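To make the 0.5 threshold explicit, we can threshold the Ultra probabilities ourselves and check how often this reproduces the model's own class predictions; a small sketch:
# Sketch: applying the default 0.5 threshold to the Ultra probabilities
# and comparing against the classifier's own predictions
proba_ultra = final_model.predict_proba(features_test)[:, 1]
manual_predictions = (proba_ultra > 0.5).astype(int)
agreement = (manual_predictions == final_model.predict(features_test)).mean()
print('Agreement with final_model.predict():', agreement)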
We successfully created a model to predict which plan a legacy customer should upgrade to; the choices were the Ultra plan or not the Ultra plan. Our model accurately predicted the correct class 80% of the time on the test set. Accuracy was chosen as the key evaluation metric because it reflects the overall proportion of correct recommendations across both classes, which matters for customer satisfaction: we want to recommend the plan that actually fits each customer's needs. The model worked best on not Ultra classifications, as it had high specificity. The precision of the model was fair, with Ultra predictions being correct 73.1% of the time.

The model was tuned over the number of estimators and the max depth within ranges that kept training time reasonable, and other hyperparameters were experimented with to determine what the best model should use. Overall, the model could be improved by further widening the ranges for estimators and max depth. We also believe the model would work better with a more balanced dataset, meaning more data on customers who migrated to the Ultra plan. Collecting that data could prove difficult if customers prefer not to migrate to Ultra, so this model may be close to the final product.
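If we do extend the search, scikit-learn's GridSearchCV is one way to widen the ranges for n_estimators and max_depth, and to try class_weight='balanced' for the imbalance, without hand-rolled loops. The sketch below uses illustrative, untuned grid values; a grid this size trains many models and can take a while.
# Sketch: a wider, cross-validated hyperparameter search (illustrative values)
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': list(range(10, 201, 20)),
    'max_depth': list(range(2, 31, 2)),
    'class_weight': [None, 'balanced'],  # 'balanced' partially offsets the class imbalance
}
search = GridSearchCV(
    RandomForestClassifier(random_state=19),
    param_grid,
    scoring='accuracy',
    cv=5,
)
search.fit(features_train, target_train)
print(search.best_params_, search.best_score_)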