The Sure Tomorrow insurance company wants to solve several tasks with the help of machine learning, and we are asked to evaluate whether that is feasible.
# pip install scikit-learn --upgrade
# import libraries
import numpy as np
import pandas as pd
import math
import seaborn as sns
import sklearn.linear_model
import sklearn.metrics
from sklearn.metrics import f1_score
import sklearn.neighbors
from sklearn.neighbors import KNeighborsClassifier
import sklearn.preprocessing
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import train_test_split, cross_val_score
from IPython.display import display
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc, r2_score, mean_squared_error as mse
from sklearn.utils import shuffle
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import classification_report
from sklearn.neighbors import NearestNeighbors
Load data and conduct a basic check that it's free from obvious issues.
# read dataframe
df = pd.read_csv('datasets/insurance_us.csv')
We rename the columns to keep the code stylistically consistent.
# change column names
df = df.rename(columns={'Gender': 'gender', 'Age': 'age', 'Salary': 'income', 'Family members': 'family_members', 'Insurance benefits': 'insurance_benefits'})
# look at the data
df.sample(10)
| | gender | age | income | family_members | insurance_benefits |
|---|---|---|---|---|---|
3213 | 0 | 22.0 | 35400.0 | 1 | 0 |
4851 | 0 | 31.0 | 39200.0 | 0 | 0 |
2587 | 1 | 26.0 | 59200.0 | 0 | 0 |
4581 | 1 | 41.0 | 38400.0 | 1 | 0 |
3940 | 0 | 25.0 | 46800.0 | 2 | 0 |
4394 | 0 | 38.0 | 36800.0 | 1 | 0 |
2641 | 0 | 32.0 | 28300.0 | 1 | 0 |
46 | 0 | 26.0 | 34500.0 | 1 | 0 |
3731 | 1 | 31.0 | 57100.0 | 2 | 0 |
1178 | 1 | 18.0 | 58400.0 | 2 | 0 |
We have four features (gender, age, income, and family members) and one target (insurance benefits).
# info on columns
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   gender              5000 non-null   int64
 1   age                 5000 non-null   float64
 2   income              5000 non-null   float64
 3   family_members      5000 non-null   int64
 4   insurance_benefits  5000 non-null   int64
dtypes: float64(2), int64(3)
memory usage: 195.4 KB
# convert age and income from float to int (the values are whole numbers, so nothing is lost)
df.age = df.age.astype('int')
df.income = df.income.astype('int')
# check to see that the conversion was successful
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   gender              5000 non-null   int64
 1   age                 5000 non-null   int32
 2   income              5000 non-null   int32
 3   family_members      5000 non-null   int64
 4   insurance_benefits  5000 non-null   int64
dtypes: int32(2), int64(3)
memory usage: 156.4 KB
# looking for missing values
df.isna().sum()
gender                0
age                   0
income                0
family_members        0
insurance_benefits    0
dtype: int64
# looking for duplicates
df[['age','gender', 'income', 'family_members', 'insurance_benefits']].duplicated().sum()
153
# now have a look at the data's descriptive statistics.
# Does everything look okay?
df.describe()
| | gender | age | income | family_members | insurance_benefits |
|---|---|---|---|---|---|
count | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 |
mean | 0.499000 | 30.952800 | 39916.359400 | 1.194200 | 0.148000 |
std | 0.500049 | 8.440807 | 9900.082063 | 1.091387 | 0.463183 |
min | 0.000000 | 18.000000 | 5300.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 24.000000 | 33300.000000 | 0.000000 | 0.000000 |
50% | 0.000000 | 30.000000 | 40200.000000 | 1.000000 | 0.000000 |
75% | 1.000000 | 37.000000 | 46600.000000 | 2.000000 | 0.000000 |
max | 1.000000 | 65.000000 | 79000.000000 | 6.000000 | 5.000000 |
The data is clean, with no missing values. The 153 duplicated rows may not be true duplicates: the dataset has no unique customer identifier, so distinct customers can legitimately share the same gender, age, income, and family size. We therefore keep them.
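A quick way to eyeball the repeated rows before deciding to keep them (a sketch; `keep=False` marks every member of each duplicate group):

# inspect the repeated rows side by side
dup_rows = df[df.duplicated(keep=False)].sort_values(by=['gender', 'age', 'income', 'family_members'])
dup_rows.head(10)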
# correlations
df.corr()
| | gender | age | income | family_members | insurance_benefits |
|---|---|---|---|---|---|
gender | 1.000000 | 0.002074 | 0.014910 | -0.008991 | 0.010140 |
age | 0.002074 | 1.000000 | -0.019093 | -0.006692 | 0.651030 |
income | 0.014910 | -0.019093 | 1.000000 | -0.030296 | -0.014963 |
family_members | -0.008991 | -0.006692 | -0.030296 | 1.000000 | -0.036290 |
insurance_benefits | 0.010140 | 0.651030 | -0.014963 | -0.036290 | 1.000000 |
# data skew
df.skew()
gender                0.004001
age                   0.515148
income               -0.036724
family_members        0.898297
insurance_benefits    3.845707
dtype: float64
# correlation matrix
px.imshow(df.corr(), title='Correlation Matrix', text_auto=True, height=900,
template='ggplot2')
Let's quickly check whether there are certain groups of customers by looking at the pair plot.
# scatter matrix
fig = px.scatter_matrix(df, height=800, title='Scatter Matrix')
fig.update_traces(showupperhalf=False, diagonal_visible=False)
It is difficult to spot obvious customer groups (clusters) by eye, because that requires analyzing several variables simultaneously (multivariate distributions); this is where linear algebra and machine learning come in handy. The correlation matrix shows a moderately strong positive relationship (0.65) between age and insurance benefits; all other feature pairs show weak relationships.
# distribution of age
px.histogram(df.age, title=' Distribution of Age', template='ggplot2', height=800, labels={'value': 'Age'})
Age shows a right-skewed distribution, with the mean and median both around 30. The minimum age is 18 and the maximum is 65.
# distribution of income
px.histogram(df.income, title='Distribution of Income', template='plotly_dark', height=800, labels={'value': 'Salary'})
Income is approximately normally distributed around a mean of about $40,000 and ranges from $5,300 to $79,000.
# distribution of family members
px.histogram(df.family_members, title='Distribution of Family Members', template='seaborn', height=800, labels={'value': 'Number of Family Members'})
The distribution of family members is right-skewed, ranging from 0 to 6 with a mean of 1.19 and a median of 1.
# distribution of gender
px.bar(df.gender.value_counts(), color_discrete_sequence=['pink', 'blue'], labels={'index': 'Gender', 'value': 'Count'}, title='Distribution of Gender', height=800)
Gender is roughly balanced in the dataset. We assume 0 encodes female and 1 encodes male.
# distribution of insurance benefits
px.histogram(df.insurance_benefits, labels={'value': 'Number of Benefits'}, title='Distribution of Insurance Benefits', height=800, template='none')
The distribution of insurance benefits is heavily right-skewed: the mean is about 0.15, the median is 0, and the maximum is 5.
# euclidean
feature_names = ['gender', 'age', 'income', 'family_members']
nbrs = NearestNeighbors(metric='euclidean')
nbrs.fit(df[feature_names])
# pass a one-row DataFrame so the query keeps its feature names (avoids the sklearn UserWarning)
nbrs_distances, nbrs_indices = nbrs.kneighbors(df.loc[[1], feature_names], n_neighbors=5, return_distance=True)
df_res = pd.concat([df.iloc[nbrs_indices[0]], pd.DataFrame(nbrs_distances.T, index=nbrs_indices[0], columns=['distance'])], axis=1)
# euclidean
df_res.head()
| | gender | age | income | family_members | insurance_benefits | distance |
|---|---|---|---|---|---|---|
1 | 0 | 46 | 38000 | 1 | 1 | 0.000000 |
3920 | 0 | 40 | 38000 | 0 | 0 | 6.082763 |
4948 | 1 | 37 | 38000 | 1 | 0 | 9.055385 |
2528 | 1 | 36 | 38000 | 0 | 0 | 10.099505 |
3593 | 0 | 33 | 38000 | 0 | 0 | 13.038405 |
# Manhattan
feature_names = ['gender', 'age', 'income', 'family_members']
nbrs = NearestNeighbors(metric='manhattan')
nbrs.fit(df[feature_names])
# pass a one-row DataFrame so the query keeps its feature names (avoids the sklearn UserWarning)
nbrs_distances, nbrs_indices = nbrs.kneighbors(df.loc[[1], feature_names], n_neighbors=5, return_distance=True)
df_res = pd.concat([df.iloc[nbrs_indices[0]], pd.DataFrame(nbrs_distances.T, index=nbrs_indices[0], columns=['distance'])], axis=1)
# manhattan
df_res.head()
| | gender | age | income | family_members | insurance_benefits | distance |
|---|---|---|---|---|---|---|
1 | 0 | 46 | 38000 | 1 | 1 | 0.0 |
3920 | 0 | 40 | 38000 | 0 | 0 | 7.0 |
4948 | 1 | 37 | 38000 | 1 | 0 | 10.0 |
2528 | 1 | 36 | 38000 | 0 | 0 | 12.0 |
3593 | 0 | 33 | 38000 | 0 | 0 | 14.0 |
Scaling the data.
# scaling numerical columns
feature_names = ['gender', 'age', 'income', 'family_members']
transformer_mas = sklearn.preprocessing.MaxAbsScaler().fit(df[feature_names].to_numpy())
df_scaled = df.copy()
df_scaled.loc[:, feature_names] = transformer_mas.transform(df[feature_names].to_numpy())
# look at scaled data
df_scaled.sample(5)
| | gender | age | income | family_members | insurance_benefits |
|---|---|---|---|---|---|
1161 | 0.0 | 0.292308 | 0.494937 | 0.5 | 0 |
1921 | 1.0 | 0.492308 | 0.507595 | 0.0 | 0 |
2074 | 1.0 | 0.307692 | 0.369620 | 0.0 | 0 |
1181 | 0.0 | 0.692308 | 0.358228 | 0.0 | 1 |
3271 | 1.0 | 0.615385 | 0.736709 | 0.0 | 0 |
Now let's retrieve similar records for a given customer, for every combination of distance metric and scaling.
# euclidean
feature_names = ['gender', 'age', 'income', 'family_members']
nbrs2 = NearestNeighbors(metric='euclidean')
nbrs2.fit(df_scaled[feature_names])
# pass a one-row DataFrame so the query keeps its feature names (avoids the sklearn UserWarning)
nbrs_distances2, nbrs_indices2 = nbrs2.kneighbors(df_scaled.loc[[1], feature_names], n_neighbors=5, return_distance=True)
df_res2 = pd.concat([df_scaled.iloc[nbrs_indices2[0]], pd.DataFrame(nbrs_distances2.T, index=nbrs_indices2[0], columns=['distance'])], axis=1)
# euclidean
df_res2.head()
| | gender | age | income | family_members | insurance_benefits | distance |
|---|---|---|---|---|---|---|
1 | 0.0 | 0.707692 | 0.481013 | 0.166667 | 1 | 0.000000 |
4162 | 0.0 | 0.707692 | 0.477215 | 0.166667 | 1 | 0.003797 |
1863 | 0.0 | 0.707692 | 0.492405 | 0.166667 | 1 | 0.011392 |
4986 | 0.0 | 0.723077 | 0.491139 | 0.166667 | 1 | 0.018418 |
4477 | 0.0 | 0.692308 | 0.459494 | 0.166667 | 1 | 0.026453 |
# manhattan
feature_names = ['gender', 'age', 'income', 'family_members']
nbrs2 = NearestNeighbors(metric='manhattan')
nbrs2.fit(df_scaled[feature_names])
# pass a one-row DataFrame so the query keeps its feature names (avoids the sklearn UserWarning)
nbrs_distances2, nbrs_indices2 = nbrs2.kneighbors(df_scaled.loc[[1], feature_names], n_neighbors=5, return_distance=True)
df_res2 = pd.concat([df_scaled.iloc[nbrs_indices2[0]], pd.DataFrame(nbrs_distances2.T, index=nbrs_indices2[0], columns=['distance'])], axis=1)
# manhattan
df_res2.head()
| | gender | age | income | family_members | insurance_benefits | distance |
|---|---|---|---|---|---|---|
1 | 0.0 | 0.707692 | 0.481013 | 0.166667 | 1 | 0.000000 |
4162 | 0.0 | 0.707692 | 0.477215 | 0.166667 | 1 | 0.003797 |
1863 | 0.0 | 0.707692 | 0.492405 | 0.166667 | 1 | 0.011392 |
4986 | 0.0 | 0.723077 | 0.491139 | 0.166667 | 1 | 0.025511 |
2434 | 0.0 | 0.676923 | 0.482278 | 0.166667 | 1 | 0.032035 |
Does leaving the data unscaled affect the kNN algorithm? If so, how does that appear?

Yes. With unscaled data, income is orders of magnitude larger than the other features, so it dominates the distance calculation and effectively mutes gender, age, and family_members: the nearest neighbors end up being customers with similar incomes regardless of everything else. After scaling, all features contribute comparably, and the two metrics return nearly identical neighbor sets.
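A minimal sketch of why unscaled distances are income-dominated: a $100 change in income moves a point ten times farther, in Euclidean distance, than a 10-year change in age.

# compare the distance impact of a small income change vs. a large age change
row = df.iloc[1][feature_names].to_numpy(dtype=float)

income_shift = row.copy()
income_shift[feature_names.index('income')] += 100   # small change in income

age_shift = row.copy()
age_shift[feature_names.index('age')] += 10          # large change in age

print(np.linalg.norm(row - income_shift))  # 100.0, dwarfs the age effect
print(np.linalg.norm(row - age_shift))     # 10.0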
How similar are the results using the Manhattan distance metric (regardless of the scaling)?
The Manhattan metric returns the same nearest neighbors as the Euclidean metric, in the same order in the unscaled case and in nearly the same order in the scaled case. The distances themselves are larger, because the Manhattan distance (the sum of absolute coordinate differences) is always greater than or equal to the Euclidean distance for the same pair of points.
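A quick check of that inequality, using the index pair (1, 3920) from the unscaled neighbor table above:

# L1 (Manhattan) vs. L2 (Euclidean) distance for the same pair of customers
a = df.iloc[1][feature_names].to_numpy(dtype=float)
b = df.iloc[3920][feature_names].to_numpy(dtype=float)
print(np.abs(a - b).sum())     # Manhattan: sum of absolute differences
print(np.linalg.norm(a - b))   # Euclidean: straight-line distance, never larger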
# look at data
df.head()
| | gender | age | income | family_members | insurance_benefits |
|---|---|---|---|---|---|
0 | 1 | 41 | 49600 | 1 | 0 |
1 | 0 | 46 | 38000 | 1 | 1 |
2 | 0 | 29 | 21000 | 0 | 0 |
3 | 0 | 21 | 41700 | 2 | 0 |
4 | 1 | 28 | 26100 | 0 | 0 |
# binarize the target with a threshold of 0.5: 1 if the customer received any benefits, 0 otherwise
binarizer = Binarizer(threshold=0.5)
df['insurance_benefits_received'] = binarizer.fit_transform(df[['insurance_benefits']])
# probability of insurance benefit received
df['insurance_benefits_received'].sum() / len(df)
0.1128
# check for the class imbalance with value_counts()
df.insurance_benefits_received.value_counts()
0    4436
1     564
Name: insurance_benefits_received, dtype: int64
# unscaled data
X = df[['age', 'gender', 'income', 'family_members']]
y = df['insurance_benefits_received']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=19)
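Since only about 11% of customers received benefits, a stratified split would keep that proportion in both halves; a small sketch (the rest of the notebook keeps the plain split above):

# optional: stratify on the target so train and test keep the ~11% positive rate
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X, y, test_size=0.3, random_state=19, stratify=y)
print(y_train_s.mean(), y_test_s.mean())  # both close to 0.1128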
# kNN on unscaled data: F1 score on the test set for k = 1..10
neighbors = np.arange(1, 11)
test_f1_scores = {}
for neighbor in neighbors:
    knn = KNeighborsClassifier(n_neighbors=neighbor)
    knn.fit(X_train, y_train)
    predicted_test = knn.predict(X_test)
    test_f1_scores[neighbor] = f1_score(y_test, predicted_test)
print(neighbors, '\n', test_f1_scores)
[ 1  2  3  4  5  6  7  8  9 10]
{1: 0.6222222222222223, 2: 0.4184100418410042, 3: 0.43243243243243246, 4: 0.22935779816513763, 5: 0.23529411764705882, 6: 0.10256410256410257, 7: 0.10050251256281409, 8: 0.0, 9: 0.010752688172043012, 10: 0.0}
# collect the F1 scores in a dataframe, transposed so k is the index
f1_scores = pd.DataFrame([test_f1_scores.values()], columns=test_f1_scores.keys()).T
# graph of test F1 score vs. number of neighbors
px.line(f1_scores, title='KNN: Varying Number of Neighbors', labels={'index': 'Number of Neighbors', 'value': 'F1 Score'})
On unscaled data the best F1 score came from k = 1 (0.62); the score then generally declines as the number of neighbors grows.
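A single train/test split can make the best k look noisy; a sketch using 5-fold cross-validated F1 on the training set (cross_val_score is already imported above):

# cross-validated F1 for each k, averaged over 5 folds
for k in range(1, 11):
    knn_cv = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn_cv, X_train, y_train, cv=5, scoring='f1')
    print(f'k={k}: mean F1 = {scores.mean():.3f}')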
# binarize the target with a threshold of 0.5: 1 if the customer received any benefits, 0 otherwise
binarizer = Binarizer(threshold=0.5)
df_scaled['insurance_benefits_received'] = binarizer.fit_transform(df_scaled[['insurance_benefits']])
# scaled data
X_scaled = df_scaled[['age', 'gender', 'income', 'family_members']]
y_scaled = df_scaled['insurance_benefits_received']
X_scaled_train, X_scaled_test, y_scaled_train, y_scaled_test = train_test_split(X_scaled, y_scaled, test_size=0.3, random_state=19)
# kNN on scaled data: F1 score on the test set for k = 1..10
neighbors = np.arange(1, 11)
test_f1_scores = {}
for neighbor in neighbors:
    knn_scaled = KNeighborsClassifier(n_neighbors=neighbor)
    knn_scaled.fit(X_scaled_train, y_scaled_train)
    predicted_test_scaled = knn_scaled.predict(X_scaled_test)
    test_f1_scores[neighbor] = f1_score(y_scaled_test, predicted_test_scaled)
print(neighbors, '\n', test_f1_scores)
[ 1  2  3  4  5  6  7  8  9 10]
{1: 0.9497206703910615, 2: 0.8961424332344213, 3: 0.9178470254957507, 4: 0.880952380952381, 5: 0.9273743016759777, 6: 0.8802395209580838, 7: 0.8979591836734695, 8: 0.8734939759036144, 9: 0.9005847953216374, 10: 0.876876876876877}
# collect the F1 scores in a dataframe, transposed so k is the index
f1_scores = pd.DataFrame([test_f1_scores.values()], columns=test_f1_scores.keys()).T
# graph of test F1 score vs. number of neighbors
px.line(f1_scores, title='KNN: Varying Number of Neighbors', labels={'index': 'Number of Neighbors', 'value': 'F1 Score'})
On scaled data the best F1 score again came from k = 1 (0.95); the scores fluctuate but trend slightly downward as k grows. For every value of k, the scaled results are substantially better than the unscaled ones.
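One caveat: the scaler above was fit on the full dataset before splitting, which leaks a little test information into training. A sketch of the leak-free alternative, fitting the scaler inside a pipeline so it only ever sees the training data:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler

# the pipeline fits MaxAbsScaler on the training data only, then applies kNN
pipe = make_pipeline(MaxAbsScaler(), KNeighborsClassifier(n_neighbors=1))
pipe.fit(X_train, y_train)
print(f1_score(y_test, pipe.predict(X_test)))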
# function for classifier evaluation: F1 score plus a normalized confusion matrix
def eval_classifier(y_true, y_pred):
    f1 = sklearn.metrics.f1_score(y_true, y_pred)  # local name avoids shadowing the imported f1_score
    print(f'F1: {f1:.2f}')
    # if you have an issue with the following line, restart the kernel and run the notebook again
    cm = sklearn.metrics.confusion_matrix(y_true, y_pred, normalize='all')
    print('Confusion Matrix')
    print(cm)
# generate the output of a random model that predicts 1 with probability P
def rnd_model_predict(P, size, seed=42):
    rng = np.random.default_rng(seed=seed)
    return rng.binomial(n=1, p=P, size=size)
# evaluate the random model at several probabilities
for P in [0, df['insurance_benefits_received'].sum() / len(df), 0.5, 1]:
    print(f'The probability: {P:.2f}')
    y_pred_rnd = rnd_model_predict(P, 5000)
    eval_classifier(df['insurance_benefits_received'], y_pred_rnd)
    print()
The probability: 0.00
F1: 0.00
Confusion Matrix
[[0.8872 0.    ]
 [0.1128 0.    ]]

The probability: 0.11
F1: 0.12
Confusion Matrix
[[0.7914 0.0958]
 [0.0994 0.0134]]

The probability: 0.50
F1: 0.20
Confusion Matrix
[[0.456  0.4312]
 [0.053  0.0598]]

The probability: 1.00
F1: 0.20
Confusion Matrix
[[0.     0.8872]
 [0.     0.1128]]
The dummy model shows the F1 scores achievable by predicting at random with various probabilities. The kNN models, on both scaled and unscaled data, achieve much higher F1 scores, so they are genuinely better at classifying whether insurance benefits were received than a random baseline.
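The same baseline can be reproduced with sklearn's built-in DummyClassifier; a sketch equivalent to the P = 0.11 case above:

from sklearn.dummy import DummyClassifier

# predicts labels at random, matching the class frequencies seen in training
dummy = DummyClassifier(strategy='stratified', random_state=42)
dummy.fit(X_train, y_train)
eval_classifier(y_test, dummy.predict(X_test))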
# linear regression via the normal equation: w = (X^T X)^(-1) X^T y
class MyLinearRegression:
    def __init__(self):
        self.weights = None

    def fit(self, X, y):
        # prepend a column of ones for the intercept term
        X2 = np.append(np.ones([len(X), 1]), X, axis=1)
        self.weights = np.linalg.inv(X2.T @ X2) @ X2.T @ y

    def predict(self, X):
        # prepend a column of ones for the intercept term
        X2 = np.append(np.ones([len(X), 1]), X, axis=1)
        return X2 @ self.weights
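A sanity check for the implementation (a sketch; check_against_sklearn is a hypothetical helper): the normal-equation weights should match sklearn's LinearRegression.

from sklearn.linear_model import LinearRegression

def check_against_sklearn(X, y):
    ours = MyLinearRegression()
    ours.fit(X, y)
    ref = LinearRegression().fit(X, y)
    # weights[0] is our intercept; weights[1:] are the feature coefficients
    print(np.allclose(ours.weights[1:], ref.coef_), np.allclose(ours.weights[0], ref.intercept_))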
# evaluation for regressor algorithm: RMSE and R2 on the test set
def eval_regressor(y_true, y_pred):
    rmse = math.sqrt(sklearn.metrics.mean_squared_error(y_true, y_pred))
    print(f'RMSE: {rmse:.2f}')
    # report R2 directly; taking its square root would misstate the metric
    r2 = sklearn.metrics.r2_score(y_true, y_pred)
    print(f'R2: {r2:.2f}')
# Running regression model on original data
X = df[['age', 'gender', 'income', 'family_members']].to_numpy()
y = df['insurance_benefits'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12345)
lr = MyLinearRegression()
lr.fit(X_train, y_train)
print(lr.weights)
y_test_pred = lr.predict(X_test)
eval_regressor(y_test, y_test_pred)
[-9.43538930e-01  3.57495491e-02  1.64272730e-02 -2.60745684e-07 -1.16902138e-02]
RMSE: 0.34
R2: 0.66
# Running regression model on scaled data
X_scaled = df_scaled[['age', 'gender', 'income', 'family_members']].to_numpy()
y_scaled = df_scaled['insurance_benefits'].to_numpy()
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(X_scaled, y_scaled, test_size=0.3, random_state=12345)
lr_scaled = MyLinearRegression()
lr_scaled.fit(X_train_scaled, y_train_scaled)
print(lr_scaled.weights)
y_test_pred_scaled = lr_scaled.predict(X_test_scaled)
eval_regressor(y_test_scaled, y_test_pred_scaled)
[-0.94353893  2.32372069  0.01642727 -0.02059891 -0.07014128]
RMSE: 0.34
R2: 0.66
Running the linear regression on both the original data and the scaled data, we see that the evaluation metrics do not change: the RMSE is 0.34 and the R2 value is identical in both runs. Scaling the data therefore does not change the quality of the model.
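This is expected algebraically. MaxAbsScaler replaces $X$ with $X' = XS$, where $S$ is an invertible diagonal matrix of reciprocal column maxima. The normal equation then gives

$$w' = [(XS)^\top XS]^{-1}(XS)^\top y = S^{-1}(X^\top X)^{-1}X^\top y = S^{-1}w,$$

so the predictions $X'w' = XSS^{-1}w = Xw$ are identical, and so are RMSE and $R^2$. The fitted weights above confirm this: for example, the age weight grows by a factor of 65 (the maximum age) after scaling.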
# data to be obfuscated
personal_info_column_list = ['gender', 'age', 'income', 'family_members']
df_pn = df[personal_info_column_list]
# convert data to numpy array
X = df_pn.to_numpy()
# look at the array
X
array([[    1,    41, 49600,     1],
       [    0,    46, 38000,     1],
       [    0,    29, 21000,     0],
       ...,
       [    0,    20, 33900,     2],
       [    1,    22, 32700,     3],
       [    1,    28, 40600,     1]], dtype=int64)
Generating a random matrix $P$.
# random factor P
rng = np.random.default_rng(seed=42)
P = rng.random(size=(X.shape[1], X.shape[1]))
Checking that the matrix $P$ is invertible.
# visual of P
P
array([[0.77395605, 0.43887844, 0.85859792, 0.69736803],
       [0.09417735, 0.97562235, 0.7611397 , 0.78606431],
       [0.12811363, 0.45038594, 0.37079802, 0.92676499],
       [0.64386512, 0.82276161, 0.4434142 , 0.22723872]])
# inverse of P
P_inv = np.linalg.inv(P)
P_inv
array([[ 0.41467992, -1.43783972,  0.62798546,  1.14001268],
       [-1.06101789,  0.44219337,  0.1329549 ,  1.18425933],
       [ 1.42362442,  1.60461607, -2.0553823 , -1.53699695],
       [-0.11128575, -0.65813802,  1.74995517, -0.11816316]])
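np.linalg.inv would have raised a LinAlgError if $P$ were singular; a quick numerical confirmation that we truly have an inverse:

# P @ P_inv should be (numerically) the 4x4 identity matrix
print(np.allclose(P @ P_inv, np.eye(P.shape[0])))  # True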
# new obfuscated dataset
X_new = X @ P
X_new
array([[ 6359.71527314, 22380.40467609, 18424.09074184, 46000.69669016],
       [ 4873.29406479, 17160.36702982, 14125.78076133, 35253.45577301],
       [ 2693.11742928,  9486.397744  ,  7808.83156024, 19484.86063067],
       ...,
       [ 4346.2234249 , 15289.24126492, 12586.16264392, 31433.50888552],
       [ 4194.09324155, 14751.9910242 , 12144.02930637, 30323.88763426],
       [ 5205.46827354, 18314.24814446, 15077.01370762, 37649.59295455]])
Can you guess the customers' ages or income after the transformation?
No. After multiplication by $P$, every obfuscated column is a linear mixture of all four original features, so individual ages or incomes are no longer recognizable.
Can you recover the original data from $X'$ if you know $P$?
# note: P @ X.T multiplies P on the wrong side, so it does NOT recover X
P @ X.T
array([[42605.92216771, 32647.60673289, 18043.28379289, ...,
        29116.64178985, 28088.67336691, 34872.83546879],
       [37793.40997679, 28968.97336811, 16012.22678999, ...,
        25823.72047312, 24913.18431708, 30930.46956831],
       [18411.10270401, 14111.96943897,  7799.81970108, ...,
        12580.91427022, 12137.91229164, 15068.06546873],
       [22027.94859182, 16887.81382837,  9335.55826216, ...,
        15048.65104996, 14519.07063843, 18026.5249014 ]])
# recover the data by multiplying on the right: X' @ P^(-1) = X P P^(-1) = X
X_new @ P_inv
array([[ 1.00000000e+00,  4.10000000e+01,  4.96000000e+04,  1.00000000e+00],
       [-3.63797881e-12,  4.60000000e+01,  3.80000000e+04,  1.00000000e+00],
       [ 1.81898940e-12,  2.90000000e+01,  2.10000000e+04,  0.00000000e+00],
       ...,
       [ 0.00000000e+00,  2.00000000e+01,  3.39000000e+04,  2.00000000e+00],
       [ 1.00000000e+00,  2.20000000e+01,  3.27000000e+04,  3.00000000e+00],
       [ 1.00000000e+00,  2.80000000e+01,  4.06000000e+04,  1.00000000e+00]])
Printing all three cases (original, transformed, recovered) for a few customers.
# original
X
array([[    1,    41, 49600,     1],
       [    0,    46, 38000,     1],
       [    0,    29, 21000,     0],
       ...,
       [    0,    20, 33900,     2],
       [    1,    22, 32700,     3],
       [    1,    28, 40600,     1]], dtype=int64)
# transformed
X @ P
array([[ 6359.71527314, 22380.40467609, 18424.09074184, 46000.69669016],
       [ 4873.29406479, 17160.36702982, 14125.78076133, 35253.45577301],
       [ 2693.11742928,  9486.397744  ,  7808.83156024, 19484.86063067],
       ...,
       [ 4346.2234249 , 15289.24126492, 12586.16264392, 31433.50888552],
       [ 4194.09324155, 14751.9910242 , 12144.02930637, 30323.88763426],
       [ 5205.46827354, 18314.24814446, 15077.01370762, 37649.59295455]])
# recovered: multiply X_new by P_inv on the right, mirroring X_new = X @ P
X_recovered = X_new @ P_inv
X_recovered
array([[ 1.00000000e+00,  4.10000000e+01,  4.96000000e+04,  1.00000000e+00],
       [-3.63797881e-12,  4.60000000e+01,  3.80000000e+04,  1.00000000e+00],
       [ 1.81898940e-12,  2.90000000e+01,  2.10000000e+04,  0.00000000e+00],
       ...,
       [ 0.00000000e+00,  2.00000000e+01,  3.39000000e+04,  2.00000000e+00],
       [ 1.00000000e+00,  2.20000000e+01,  3.27000000e+04,  3.00000000e+00],
       [ 1.00000000e+00,  2.80000000e+01,  4.06000000e+04,  1.00000000e+00]])
You can probably see that some values are not exactly the same as they are in the original data. What might be the reason for that?
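The reason is floating-point arithmetic: computing $P^{-1}$ and the matrix products introduces tiny rounding errors, so a few recovered values differ from the originals by amounts on the order of $10^{-12}$. A quick confirmation:

# the recovery is exact up to floating-point rounding error
print(np.allclose(X, X_new @ P_inv))      # True
print(np.abs(X - X_new @ P_inv).max())    # tiny rounding residue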
# original data
X = df[['age', 'gender', 'income', 'family_members']].to_numpy()
y = df['insurance_benefits'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=19)
linreg = MyLinearRegression()
linreg.fit(X_train, y_train)
print(linreg.weights)
y_test_pred = linreg.predict(X_test)
eval_regressor(y_test, y_test_pred)
[-9.21298248e-01  3.48958441e-02 -9.02018203e-03  1.92322314e-07 -1.30438947e-02]
RMSE: 0.37
R2: 0.66
# Obfuscated data
# X_new
# y = df['insurance_benefits'].to_numpy()
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_new, y, test_size=0.3, random_state=19)
linreg2 = MyLinearRegression()
linreg2.fit(X_train2, y_train2)
print(linreg2.weights)
y_test_pred2 = linreg2.predict(X_test2)
eval_regressor(y_test2, y_test_pred2)
[-0.92129821 -0.0687852   0.00955393  0.06320114 -0.02042082]
RMSE: 0.37
R2: 0.66
Running the linear regression on both the original data and the obfuscated data, we again see that the evaluation metrics do not change: the RMSE is 0.37 and the R2 value is identical in both runs. Obfuscation therefore does not break the model, as its predictive quality is unchanged.
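This is expected analytically. With $X' = XP$ for an invertible $P$, the normal equation gives

$$w_P = [(XP)^\top XP]^{-1}(XP)^\top y = P^{-1}(X^\top X)^{-1}X^\top y = P^{-1}w,$$

so the predictions coincide: $X'w_P = XPP^{-1}w = Xw$. Identical predictions mean identical RMSE and $R^2$. (The intercept column appended inside MyLinearRegression is untouched by $P$, so the same argument applies to the augmented matrix with a block-diagonal transformation.)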
We built a model that returns similar customers for a given one, evaluating it on scaled and unscaled data with both Euclidean and Manhattan distances. We then trained a kNN classifier and compared it against a dummy model that predicts at random with various probabilities; the classifier clearly outperformed the baseline on both the original and the scaled data. Next, we implemented linear regression with matrix operations, measured RMSE and R2, and compared the results against the same model trained on obfuscated data. Obfuscation did not alter the model's quality: the RMSE and R2 metrics were identical before and after. Overall, the classification model predicts quite accurately whether a customer will receive insurance benefits, which proved easier than predicting the exact number of benefits. We therefore suggest Sure Tomorrow rely on the classification model rather than the regression model for this task.