# Import Libraries
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error as mse, r2_score
from sklearn.linear_model import LinearRegression
import numpy as np
from scipy import stats as st


# Read datasets 
df0 = pd.read_csv('datasets/geo_data_0.csv')
df1 = pd.read_csv('datasets/geo_data_1.csv')
df2 = pd.read_csv('datasets/geo_data_2.csv')


# Shape of datasets 
print(df0.shape)
print(df1.shape)
print(df2.shape)

(100000, 5)
(100000, 5)
(100000, 5)


# Check for missing values
print(df0.isnull().sum())
print()
print(df1.isnull().sum())
print()
print(df2.isnull().sum())

id         0
f0         0
f1         0
f2         0
product    0
dtype: int64

id         0
f0         0
f1         0
f2         0
product    0
dtype: int64

id         0
f0         0
f1         0
f2         0
product    0
dtype: int64


# Check for duplicates
print(df0.duplicated().sum())
print()
print(df1.duplicated().sum())
print()
print(df2.duplicated().sum())

0

0

0


# correlation between features and target
print('First dataset:')
print(df0.corr()['product'])
print()

# correlation between features and target
print('Second dataset:')
print( df1.corr()['product'])
print()

# correlation between features and target
print('Third dataset:')
print(df2.corr()['product'])

First dataset:
f0         0.143536
f1        -0.192356
f2         0.483663
product    1.000000
Name: product, dtype: float64

Second dataset:
f0        -0.030491
f1        -0.010155
f2         0.999397
product    1.000000
Name: product, dtype: float64

Third dataset:
f0        -0.001987
f1        -0.001012
f2         0.445871
product    1.000000
Name: product, dtype: float64


print(df0.id.nunique())
print(df1.id.nunique())
print(df2.id.nunique())

99990
99996
99996


# First dataset
target_0 = df0['product']
features_0 = df0.drop(['product', 'id'], axis=1)

# Second dataset
target_1 = df1['product']
features_1 = df1.drop(['product', 'id'], axis=1)

# Third dataset
target_2 = df2['product']
features_2 = df2.drop(['product', 'id'], axis=1)


# Random state variable
random_state = 19


# Splitting data into train and valid sets, 75/25 

# First dataset
features_train_0, features_valid_0, target_train_0, target_valid_0 = train_test_split(features_0, target_0, test_size=0.25, random_state=random_state)

# Second dataset
features_train_1, features_valid_1, target_train_1, target_valid_1 = train_test_split(features_1, target_1, test_size=0.25, random_state=random_state)

# Third dataset
features_train_2, features_valid_2, target_train_2, target_valid_2 = train_test_split(features_2, target_2, test_size=0.25, random_state=random_state)


# Shape of results
print(features_train_0.shape)
print(target_train_0.shape)
print(features_valid_0.shape)
print(target_valid_0.shape)

(75000, 3)
(75000,)
(25000, 3)
(25000,)


# Model Function
def linear_model(ft, tt, fv, tv, n): # features_train, target_train, features_valid, target_valid, dataset number
    
    model = LinearRegression()
    model.fit(ft, tt) # train model on training set
    predictions_valid = model.predict(fv) # get model predictions on validation set
    result = mse(tv, predictions_valid) ** 0.5 # calculate RMSE on validation set
           
    print('RMSE of the linear regression model on the validation set:', result)
    print('Dataset number :', (n))
    print('R2 score: ', r2_score(tv, predictions_valid))


# Dataset 0
linear_model(features_train_0, target_train_0, features_valid_0, target_valid_0, 0)

RMSE of the linear regression model on the validation set: 37.489176404931385
Dataset number : 0
R2 score:  0.2800427586322962


# Dataset 1
linear_model(features_train_1, target_train_1, features_valid_1, target_valid_1, 1)

RMSE of the linear regression model on the validation set: 0.8949747975866225
Dataset number : 1
R2 score:  0.9996221715647515


# Dataset 2
linear_model(features_train_2, target_train_2, features_valid_2, target_valid_2, 2)

RMSE of the linear regression model on the validation set: 40.03911711877307
Dataset number : 2
R2 score:  0.19763128804301888


# Model 0
model_0 = LinearRegression()
model_0.fit(features_train_0, target_train_0) # train model on training set

# Predictions for model 0
predictions_0 = model_0.predict(features_valid_0)
predictions_0 = pd.Series(predictions_0)

# Merged dataframe of target and predictions 
merged_0 = pd.concat([target_valid_0.reset_index(drop=True), predictions_0], axis=1)
merged_0.columns = ['target', 'predictions']


# Model 1
model_1 = LinearRegression()
model_1.fit(features_train_1, target_train_1) # train model on training set

# Predictions for model 1
predictions_1 = model_1.predict(features_valid_1)
predictions_1 = pd.Series(predictions_1)

# Merged dataframe of target and predictions 
merged_1 = pd.concat([target_valid_1.reset_index(drop=True), predictions_1], axis=1)
merged_1.columns = ['target', 'predictions']


# Model 2
model_2 = LinearRegression()
model_2.fit(features_train_2, target_train_2) # train model on training set

predictions_2 = model_2.predict(features_valid_2)
predictions_2 = pd.Series(predictions_2)

merged_2 = pd.concat([target_valid_2.reset_index(drop=True), predictions_2], axis=1)
merged_2.columns = ['target', 'predictions']


print('Mean Predictions 0:', predictions_0.mean(), '        ', 'Stdev Predictions 0:', predictions_0.std())
print('Mean Predictions 1:', predictions_1.mean(),'        ', 'Stdev Predictions 1:', predictions_1.std())
print('Mean Predictions 2:', predictions_2.mean(), '        ', 'Stdev Predictions 2:', predictions_2.std())

Mean Predictions 0: 92.42951694121892          Stdev Predictions 0: 23.25461037847274
Mean Predictions 1: 68.68102185200009          Stdev Predictions 1: 46.0429779067258
Mean Predictions 2: 94.91135196705469          Stdev Predictions 2: 19.79760332480081


# Comparing target and predictions of dataset 0 
merged_0.sum()

target         2.311242e+06
predictions    2.310738e+06
dtype: float64


# Comparing target and predictions of dataset 0 
merged_0.describe().T


# Comparing target and predictions of dataset 1
merged_1.sum()

target         1.716792e+06
predictions    1.717026e+06
dtype: float64


# Comparing target and predictions of dataset 1
merged_1.describe().T


# Comparing target and predictions of dataset 2
merged_2.sum()

target         2.376934e+06
predictions    2.372784e+06
dtype: float64


# Comparing target and predictions of dataset 2
merged_2.describe().T


# Cost per barrel
cpb = 4.5 

# budget
budget = 100000000

# Number of wells
wells = 200

# Factor for unit of barrels
factor = 1000

unit_cost = 4500

# Budget per well
bpw = budget / wells

# Budget per barrel
bpb = bpw / unit_cost

# Break even reserve volume
break_even = budget / (cpb * factor)

# Minimum volume to break even
min_product = break_even / wells


# Comparing mean volume with break even point
print('Volume in region 0 after break even: ', f'{merged_0.target.mean() - (min_product):,}', 'barrels')
print('Volume in region 1 after break even: ', f'{merged_1.target.mean() - (min_product):,}', 'barrels')
print('Volume in region 2 after break even: ', f'{merged_2.target.mean() - (min_product):,}', 'barrels')

Volume in region 0 after break even:  -18.66144208004802 barrels
Volume in region 1 after break even:  -42.43944890421423 barrels
Volume in region 2 after break even:  -16.033768694089275 barrels


# Function to determine profit predictions to target
def profit(target, predictions, count):
    predictions_sorted = predictions.sort_values(ascending=False)
    selected = target[predictions_sorted.index][:count]
    return (unit_cost * selected.sum()) - budget


print('Total profit from Region 0 :', '$', f'{profit(target_0, predictions_0, 200 ):,}')
print('Total profit from Region 1 :', '$', f'{profit(target_1, predictions_1, 200 ):,}')
print('Total profit from Region 2 :', '$', f'{profit(target_2, predictions_2, 200 ):,}')

Total profit from Region 0 : $ -16,617,291.92250064
Total profit from Region 1 : $ -37,881,648.63171914
Total profit from Region 2 : $ -16,543,855.132537901


# Function for confidence interval
def confidence(sample): 
    print('Mean:', sample.mean())
    confidence_interval = st.t.interval(0.95, len(sample)-1, sample.mean(), sample.sem())
    print('95% confidence interval:', confidence_interval)


confidence(predictions_0)

Mean: 92.42951694121892
95% confidence interval: (92.141241144103, 92.71779273833484)


confidence(predictions_1)

Mean: 68.68102185200009
95% confidence interval: (68.11025003765316, 69.25179366634701)


confidence(predictions_2)

Mean: 94.91135196705469
95% confidence interval: (94.66593096078726, 95.1567729733221)


# Function for determining profit, risk, and losses 
def profit_risk(target, predictions):
    state = np.random.RandomState(19)
    values = []
    for i in range(1000):
        target_subsample = target.sample(n=500, replace=True, random_state=state)
        pred_subsample = predictions[target_subsample.index]
        values.append(profit(target_subsample, pred_subsample, 200))

    values = pd.Series(values)
    losses = (values < 0 ).mean() * 100
   

    print((f'{values.quantile(0.025):,}', f'{values.quantile(0.975):,}'))
    print('Mean Profit: ', f'{values.mean():,}')
    print('Losses : ', losses)   
    return


# Region 0
profit_risk(merged_0.target, merged_0.predictions)

('-682,314.259189359', '9,774,510.185401097')
Mean Profit:  4,533,870.619212874
Losses :  4.8


# Region 1
profit_risk(merged_1.target, merged_1.predictions)

('681,283.8236853313', '8,630,951.198799688')
Mean Profit:  4,945,452.4751050165
Losses :  1.0999999999999999


# Region 2
profit_risk(merged_2.target, merged_2.predictions)

('-1,538,054.1441032141', '9,219,736.453033209')
Mean Profit:  3,756,892.0224074163
Losses :  8.3

	count	mean	std	min	25%	50%	75%	max
target	25000.0	92.449669	44.183614	0.000000	56.610227	91.61242	128.522832	185.355615
predictions	25000.0	92.429517	23.254610	-9.764724	76.659319	92.45290	108.449070	180.227277

	count	mean	std	min	25%	50%	75%	max
target	25000.0	68.671662	46.043907	0.000000	26.953261	57.085625	107.813044	137.945408
predictions	25000.0	68.681022	46.042978	-2.061922	28.391403	58.319042	109.280975	139.984148

	count	mean	std	min	25%	50%	75%	max
target	25000.0	95.077342	44.699862	0.009204	59.634772	94.882039	130.685889	190.029838
predictions	25000.0	94.911352	19.797603	16.341091	81.549013	94.925208	108.410820	173.959528

OilyGiant Model ¶

Purpose ¶

Read Data ¶

Prepare Data for Models ¶

Model Training ¶

First Region ¶

Second Region ¶

Third Region ¶

Comparing Regions ¶

Region 0 ¶

Region 1 ¶

Region 2 ¶

Break Even Point ¶

Profit of Selected Region ¶

Profit per Top 200 Wells per Region based on Budget ¶

Risk and Profit for each Region ¶

Confidence Intervals ¶

Profit Risk ¶

Region 0 ¶

Region 1 ¶

Region 2 ¶

Results ¶

Conclusion ¶