Table of contents

  • The Film Junky Union
    • Purpose
    • Read Data
    • EDA
    • Preprocessing
    • Modelling Pipeline
      • Logistic Regression
      • Support Vector Machines
      • Stochastic Gradient Descent
      • eXtreme Gradient Boost
    • Final Model
    • Composed Reviews
    • Model Predictions
    • Conclusions

The Film Junky Union¶

Purpose¶

The Film Junky Union is developing a system for filtering and categorizing movie reviews. Their goal is to train a model to detect negative reviews. Using a dataset of IMDB movie reviews with polarity labels, we are tasked with building a model that classifies reviews as positive or negative. Our target metric is an F1 score of at least 0.85.

Read Data¶

In [ ]:
# !pip install --user plotly_express
In [ ]:
# import libraries
import pandas as pd 
import numpy as np
from numpy import genfromtxt
import plotly_express as px
import torch
import transformers
from tqdm.auto import tqdm
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
In [ ]:
# read dataset
df = pd.read_csv('datasets/imdb_reviews.tsv', sep='\t')
In [ ]:
# look at dataset
df.head()
Out[ ]:
tconst title_type primary_title original_title start_year end_year runtime_minutes is_adult genres average_rating votes review rating sp pos ds_part idx
0 tt0068152 movie $ $ 1971 \N 121 0 Comedy,Crime,Drama 6.3 2218.0 The pakage implies that Warren Beatty and Gold... 1 neg 0 train 8335
1 tt0068152 movie $ $ 1971 \N 121 0 Comedy,Crime,Drama 6.3 2218.0 How the hell did they get this made?! Presenti... 1 neg 0 train 8336
2 tt0313150 short '15' '15' 2002 \N 25 0 Comedy,Drama,Short 6.3 184.0 There is no real story the film seems more lik... 3 neg 0 test 2489
3 tt0313150 short '15' '15' 2002 \N 25 0 Comedy,Drama,Short 6.3 184.0 Um .... a serious film about troubled teens in... 7 pos 1 test 9280
4 tt0313150 short '15' '15' 2002 \N 25 0 Comedy,Drama,Short 6.3 184.0 I'm totally agree with GarryJohal from Singapo... 9 pos 1 test 9281
In [ ]:
# look at columns
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47331 entries, 0 to 47330
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tconst           47331 non-null  object 
 1   title_type       47331 non-null  object 
 2   primary_title    47331 non-null  object 
 3   original_title   47331 non-null  object 
 4   start_year       47331 non-null  int64  
 5   end_year         47331 non-null  object 
 6   runtime_minutes  47331 non-null  object 
 7   is_adult         47331 non-null  int64  
 8   genres           47331 non-null  object 
 9   average_rating   47329 non-null  float64
 10  votes            47329 non-null  float64
 11  review           47331 non-null  object 
 12  rating           47331 non-null  int64  
 13  sp               47331 non-null  object 
 14  pos              47331 non-null  int64  
 15  ds_part          47331 non-null  object 
 16  idx              47331 non-null  int64  
dtypes: float64(2), int64(5), object(10)
memory usage: 6.1+ MB
In [ ]:
# looking at missing values
df.isna().sum()
Out[ ]:
tconst             0
title_type         0
primary_title      0
original_title     0
start_year         0
end_year           0
runtime_minutes    0
is_adult           0
genres             0
average_rating     2
votes              2
review             0
rating             0
sp                 0
pos                0
ds_part            0
idx                0
dtype: int64
In [ ]:
# looking at missing values
df[df.average_rating.isna()]
Out[ ]:
tconst title_type primary_title original_title start_year end_year runtime_minutes is_adult genres average_rating votes review rating sp pos ds_part idx
22280 tt0192317 movie Mila Ass Painting Mila Ass Painting 1998 \N \N 0 \N NaN NaN This is a truly great film, with excellent dir... 9 pos 1 test 3231
22281 tt0192317 movie Mila Ass Painting Mila Ass Painting 1998 \N \N 0 \N NaN NaN A film is beyond all expectations, an excellen... 10 pos 1 test 3232

The rows missing average_rating are also missing votes.

In [ ]:
# Checking for duplicates
df.duplicated().sum()
Out[ ]:
0

Looking at the data, we see it is fairly clean. The few missing values are in columns that are not important for our task, and those rows still contain the review text and label we need, so we will keep them. The ds_part column marks each row as belonging to the training or test set, so we will filter on it later to separate the dataset. Most of the columns are extraneous to the task at hand and will most likely be dropped to condense the dataframes.
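
Since we will rely on ds_part to separate the data later, a quick check of its values (a minimal sketch using only that column) confirms the sizes of the two pre-defined splits.

In [ ]:
# sanity check: sizes of the pre-defined train/test splits we will filter on later
df['ds_part'].value_counts()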

EDA¶

In [ ]:
# summary statistics on columns
df.describe()
Out[ ]:
start_year is_adult average_rating votes rating pos idx
count 47331.000000 47331.000000 47329.000000 4.732900e+04 47331.000000 47331.000000 47331.000000
mean 1989.631235 0.001732 5.998278 2.556292e+04 5.484608 0.498954 6279.697999
std 19.600364 0.041587 1.494289 8.367004e+04 3.473109 0.500004 3605.702545
min 1894.000000 0.000000 1.400000 9.000000e+00 1.000000 0.000000 0.000000
25% 1982.000000 0.000000 5.100000 8.270000e+02 2.000000 0.000000 3162.000000
50% 1998.000000 0.000000 6.300000 3.197000e+03 4.000000 0.000000 6299.000000
75% 2004.000000 0.000000 7.100000 1.397400e+04 9.000000 1.000000 9412.000000
max 2010.000000 1.000000 9.700000 1.739448e+06 10.000000 1.000000 12499.000000
In [ ]:
# correlation of columns
df.corr()
Out[ ]:
start_year is_adult average_rating votes rating pos idx
start_year 1.000000 -0.008444 -0.189847 0.095835 -0.187441 -0.181571 -0.002162
is_adult -0.008444 1.000000 -0.015592 -0.012193 0.004866 0.005168 -0.017616
average_rating -0.189847 -0.015592 1.000000 0.229570 0.509180 0.481103 -0.019296
votes 0.095835 -0.012193 0.229570 1.000000 0.054170 0.052365 -0.014679
rating -0.187441 0.004866 0.509180 0.054170 1.000000 0.941231 0.000956
pos -0.181571 0.005168 0.481103 0.052365 0.941231 1.000000 0.005141
idx -0.002162 -0.017616 -0.019296 -0.014679 0.000956 0.005141 1.000000

The only feature that has a strong correlation with our target is the rating.
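
The same conclusion can be read off more directly by ranking the numeric features against the target; this one-line sketch reuses the matrix above.

In [ ]:
# rank numeric features by correlation with the target (same information as the matrix above)
df.corr()['pos'].sort_values(ascending=False)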

In [ ]:
# values of target
df.pos.value_counts()
Out[ ]:
0    23715
1    23616
Name: pos, dtype: int64
In [ ]:
# positive classes in training set
df.query("ds_part== 'train' and pos==1")['pos'].count()
Out[ ]:
11884
In [ ]:
# negative classes in training set
df.query("ds_part== 'train' and pos==0")['pos'].count()
Out[ ]:
11912
In [ ]:
# positive classes in test set
df.query("ds_part== 'test' and pos==1")['pos'].count()
Out[ ]:
11732
In [ ]:
# negative classes in test set
df.query("ds_part== 'test' and pos==0")['pos'].count()
Out[ ]:
11803

First, we note that the classes are balanced in the entire dataset. Since the data also contains markers for the training and test sets, we check for class imbalance within each split as well; both splits turn out to be balanced too.
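
The four queries above can also be collapsed into a single groupby; this sketch returns the same counts in one call.

In [ ]:
# class counts per split in one call (equivalent to the four queries above)
df.groupby(['ds_part', 'pos'])['pos'].count()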

In [ ]:
# boxplots for columns
columns = ['start_year', 'is_adult', 'average_rating', 'votes', 'rating']
for column in columns:
    label = column.upper().replace('_', ' ')
    px.box(df[column], title='Distribution of ' + label, template='ggplot2', labels={'value': label}).show()

We see some outliers in the data, yet the only feature highly correlated with our target is the rating. The average rating shows only a moderate correlation, likely because averaging across many voters dilutes the sentiment of any individual review.

In [ ]:
# distributions
for column in columns:
    label = column.upper().replace('_', ' ')
    px.histogram(df[column], title='Distribution of ' + label, template='seaborn', labels={'value': label}).show()

Start year has a left-skewed distribution concentrated around its mean of roughly 1990. Almost none of the titles are adult (the is_adult mean is about 0.2%), and average ratings cluster around 6. The distribution of votes is right-skewed, with a mean of about 25,600. The review rating has a mean of 5.48 and ranges from 1 to 10.
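
To back these skew observations with numbers, pandas' skew() can be applied to the same columns (a minimal sketch; positive values indicate right skew, negative values left skew).

In [ ]:
# quantify the skew of the distributions discussed above
df[['start_year', 'average_rating', 'votes', 'rating']].skew()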

Preprocessing¶

In [ ]:
# separate train and test
train = df[df.ds_part=='train']
test = df[df.ds_part=='test']

The dataset comes pre-split into train and test sets, so we filter on ds_part.

In [ ]:
# drop columns not needed for modelling
drop_cols = ['tconst', 'title_type', 'primary_title', 'original_title', 'start_year',
             'end_year', 'runtime_minutes', 'is_adult', 'genres', 'average_rating',
             'votes', 'rating', 'sp', 'ds_part', 'idx']

train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)
In [ ]:
# function to preprocess review text into BERT embeddings for modelling
def preprocess(df, max_sample, batch_size=200):
    max_sample_size = max_sample # set the max sample size

    # tokenize the reviews with the pretrained BERT tokenizer
    tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

    ids_list_df = []
    attention_mask_list_df = []

    max_length = 512  # BERT's maximum input length

    for input_text in df.iloc[:max_sample_size]['review']:
        ids = tokenizer.encode(input_text.lower(), add_special_tokens=True, truncation=True, max_length=max_length)
        padded = np.array(ids + [0]*(max_length - len(ids)))  # pad each sequence out to max_length
        attention_mask = np.where(padded != 0, 1, 0)          # mask the padding tokens
        ids_list_df.append(padded)
        attention_mask_list_df.append(attention_mask)
    
    # get embeddings from the pretrained BERT model
    model = transformers.BertModel.from_pretrained('bert-base-uncased')

    embeddings_df = []

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # will use cpu unless cuda is available
    print(f'Using the {device} device.')
    model.to(device)

    # a batch size around 100 is typical; lower values reduce the memory requirements.
    # note: the integer division below drops the final partial batch, so the number of
    # embedding rows returned is the largest multiple of batch_size within max_sample_size
    for i in tqdm(range(len(ids_list_df) // batch_size)):
        
        ids_batch_df = torch.LongTensor(ids_list_df[batch_size*i:batch_size*(i+1)]).to(device)
        attention_mask_batch_df = torch.LongTensor(attention_mask_list_df[batch_size*i:batch_size*(i+1)]).to(device)

        with torch.no_grad():
            model.eval()
            batch_embeddings = model(ids_batch_df, attention_mask=attention_mask_batch_df)

        # keep the [CLS] token embedding as the sentence-level representation
        embeddings_df.append(batch_embeddings[0][:,0,:].detach().cpu().numpy())

    features_df = np.concatenate(embeddings_df)  # create features
    target_df = df.iloc[:max_sample_size]['pos'] # create target (may exceed the feature length if a partial batch was dropped)

    print(features_df.shape)  # illustrate the lengths
    print(target_df.shape)
    
    return features_df, target_df  # return the features and target
In [ ]:
# processing training data 
# features_train, target_train = preprocess(train, 11884, 400)
In [ ]:
# processing testing data 
# features_test, target_test = preprocess(test, 11732, 400)
In [ ]:
# saving the arrays
# with batch_size=400, both splits yield 29 full batches (11884 // 400 = 11732 // 400 = 29),
# i.e. 29 * 400 = 11600 embedding rows, so the targets are trimmed to [:11600] to match

# np.savetxt("datasets/features_train.csv", features_train, delimiter=",")
# np.savetxt("datasets/target_train.csv", target_train[:11600], delimiter=",")

# np.savetxt("datasets/features_test.csv", features_test, delimiter=",")
# np.savetxt("datasets/target_test.csv", target_test[:11600], delimiter=",")
In [ ]:
# loading saved preprocessed data
features_train = genfromtxt('datasets/features_train.csv', delimiter=',')
target_train = genfromtxt('datasets/target_train.csv', delimiter=',')
features_test = genfromtxt('datasets/features_test.csv', delimiter=',')
target_test = genfromtxt('datasets/target_test.csv', delimiter=',')
In [ ]:
# Checking feature train shape
features_train.shape
Out[ ]:
(11600, 768)
In [ ]:
# checking target train shape
target_train.shape
Out[ ]:
(11600,)
In [ ]:
# checking features test shape
features_test.shape
Out[ ]:
(11600, 768)
In [ ]:
# checking target test shape
target_test.shape
Out[ ]:
(11600,)
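
Instead of eyeballing four separate shape printouts, a compact sketch can assert the alignment we need before modelling; 768 is the hidden size of bert-base-uncased.

In [ ]:
# sanity checks: features and targets align row-for-row, and embeddings are BERT-sized
assert features_train.shape[0] == target_train.shape[0]
assert features_test.shape[0] == target_test.shape[0]
assert features_train.shape[1] == features_test.shape[1] == 768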

Modelling Pipeline¶

In [ ]:
# Classifier pipeline
# Runs through different classifiers to pick the best performing model
pipe_lr = Pipeline([('lr_classifier', LogisticRegression(random_state=19, max_iter=1000))])
pipe_dt = Pipeline([('dt_classifier', DecisionTreeClassifier(random_state=19))])
pipe_rf = Pipeline([('rf_classifier', RandomForestClassifier(random_state=19))])
pipe_sv = Pipeline([('svm_classifier', svm.SVC(random_state=19))])
pipe_nb = Pipeline([('nb_classifier', GaussianNB())])
pipe_kn = Pipeline([('knn_classifier', KNeighborsClassifier(n_neighbors=3))])
pipe_sg = Pipeline([('sgd_classifier', SGDClassifier(random_state=19))])
pipe_xg = Pipeline([('xgb_classifier', XGBClassifier(random_state=19))])

pipelines = [pipe_lr, pipe_dt, pipe_rf, pipe_sv, pipe_nb, pipe_kn, pipe_sg, pipe_xg]

best_f1 = 0
best_classifier = 0
best_pipeline = ""

pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'Random Forest', 3: 'SVM', 4: 'Naive-Bayes', 5: 'KNN', 6: 'SGD', 7: 'XGB'}

# Use cross-validation to evaluate the models
for i, model in enumerate(pipelines):
    scores = cross_val_score(model, features_train, target_train, cv=3, scoring='f1')
    print('{} Cross-Validation F1 Score: {:.2f}'.format(pipe_dict[i], scores.mean()))
    if scores.mean() > best_f1:
        best_f1 = scores.mean()
        best_pipeline = model
        best_classifier = i

# Print the best classifier
print('\nClassifier with the best F1 score: {}'.format(pipe_dict[best_classifier]))
Logistic Regression Cross-Validation F1 Score: 0.86
Decision Tree Cross-Validation F1 Score: 0.69
Random Forest Cross-Validation F1 Score: 0.81
SVM Cross-Validation F1 Score: 0.86
Naive-Bayes Cross-Validation F1 Score: 0.76
KNN Cross-Validation F1 Score: 0.72
SGD Cross-Validation F1 Score: 0.81
XGB Cross-Validation F1 Score: 0.83

Classifier with the best F1 score: SVM
In [ ]:
# classifier scores
scores_df = pd.DataFrame({'Model': pipe_dict.values(), 'Score': [0.86, 0.69, 0.81, 0.86, 0.76, 0.72, 0.81, 0.83]})
fig = px.scatter(scores_df, x='Model', y='Score', size='Score', color='Model')
fig.show()

Logistic regression and SVM are the best-performing models in terms of mean F1 score. They are followed by XGB, SGD, and random forest, with mean F1 scores of 0.83, 0.81, and 0.81, respectively.
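
GridSearchCV was imported earlier but never used; as a sketch of a possible next step, the SVC pipeline could be tuned as below. The grid values are illustrative assumptions, not tuned results, and the fit is left commented out because it is expensive.

In [ ]:
# hypothetical tuning sketch for the SVC pipeline; the grid values are assumptions
param_grid = {'svm_classifier__C': [0.1, 1, 10],
              'svm_classifier__kernel': ['rbf', 'linear']}
grid = GridSearchCV(pipe_sv, param_grid, cv=3, scoring='f1')
# grid.fit(features_train, target_train)
# print(grid.best_params_, grid.best_score_)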

Logistic Regression¶

In [ ]:
# logistic regression
pipe_lr.fit(features_train, target_train)
scores_lr = cross_val_score(pipe_lr, features_train, target_train, cv=3, scoring='f1')
print('Mean Score: ', scores_lr.mean(), '\nScore std : +/-', scores_lr.std())
Mean Score:  0.8567922248324615 
Score std : +/- 0.007115322528754756

Support Vector Machines¶

In [ ]:
# SVC  
pipe_sv.fit(features_train, target_train)
scores_sv = cross_val_score(pipe_sv, features_train, target_train, cv=3, scoring='f1')
print('Mean Score: ', scores_sv.mean(), '\nScore std : +/-', scores_sv.std())
Mean Score:  0.8582101543666933 
Score std : +/- 0.008600219969896489

Stochastic Gradient Descent¶

In [ ]:
# SGD
pipe_sg.fit(features_train, target_train)
scores_sg = cross_val_score(pipe_sg, features_train, target_train, cv=3, scoring='f1')
print('Mean Score: ', scores_sg.mean(), '\nScore std : +/-', scores_sg.std())
Mean Score:  0.811195832207457 
Score std : +/- 0.06985625735303719

eXtreme Gradient Boost¶

In [ ]:
# XGB
pipe_xg.fit(features_train, target_train)
scores_xg = cross_val_score(pipe_xg, features_train, target_train, cv=3, scoring='f1')
print('Mean Score: ', scores_xg.mean(), '\nScore std : +/-', scores_xg.std())
Mean Score:  0.8347603036296536 
Score std : +/- 0.010658198482598863

Here, we fit the best models on the full training set so we can use them for predictions later. All of them hold cross-validated F1 scores above 0.80 on the training set.

Final Model¶

In [ ]:
# Training classifier
final = VotingClassifier(estimators=[('sgd', pipe_sg),
                                ('log', pipe_lr),
                                ('svm', pipe_sv),
                                ('xgb', pipe_xg),
                                ('rf', pipe_rf)], 
                                verbose=1)
final = final.fit(features_train, target_train)

# Make predictions on the test set
final_predictions = final.predict(features_test)

result = f1_score(target_test, final_predictions)
print()
print("voting regressor model on the test set: ", result) 
[Voting] ...................... (1 of 5) Processing sgd, total=   2.5s
[Voting] ...................... (2 of 5) Processing log, total=   5.1s
[Voting] ...................... (3 of 5) Processing svm, total= 1.0min
[Voting] ...................... (4 of 5) Processing xgb, total= 1.2min
[Voting] ....................... (5 of 5) Processing rf, total=  34.4s

voting classifier model on the test set:  0.865220971726599

In order to achieve the F1 target of 0.85, we use a voting classifier that combines the SGD, logistic regression, SVM, XGB, and random forest models. Using a voting classifier allows the ensemble to offset the individual weaknesses of its members.
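
As a toy illustration of the hard (majority) voting that VotingClassifier performs by default: with five estimators, the predicted class is whichever label at least three of them choose. The votes below are hypothetical.

In [ ]:
# minimal sketch of hard majority voting with five estimators
votes = np.array([[1, 1, 0, 1, 0],   # hypothetical per-estimator predictions for review 1
                  [0, 0, 1, 0, 0]])  # and for review 2
majority = (votes.sum(axis=1) > votes.shape[1] / 2).astype(int)
print(majority)  # expected: [1 0]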

Composed Reviews¶

In [ ]:
# creating new test data
data = {'review': ['This is the best movie ever', 'This movie was not good at all', 'I did not like this movie', 'I wasted my money'], 'pos': [1, 0, 0, 0]}
new = pd.DataFrame(data)
new
Out[ ]:
review pos
0 This is the best movie ever 1
1 This movie was not good at all 0
2 I did not like this movie 0
3 I wasted my money 0
In [ ]:
# processing new test data
features_new, _ = preprocess(new, 4, 4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using the cpu device.
  0%|          | 0/1 [00:00<?, ?it/s]
C:\Users\XIX\AppData\Local\Temp\ipykernel_22072\3780956929.py:34: UserWarning:

Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\utils\tensor_new.cpp:233.)

(4, 768)
(4,)

We manually compose reviews to test the predictions of our models. We add a pos column with our own classification of each review because the preprocess function defined earlier requires one; when we preprocess the features, we discard the target dataframe it returns, since a prediction input should not carry the target variable. We write four reviews: one clearly positive and three clearly negative, where the negative ones deliberately contain positive words such as "good" and "like" that could confuse the model. We then compare the predictions of the best-performing individual models with those of the final model.

Model Predictions¶

In [ ]:
# Logistic Regression predictions
pipe_lr.predict(features_new)
Out[ ]:
array([1., 0., 0., 0.])
In [ ]:
# SVC predictions
pipe_sv.predict(features_new)
Out[ ]:
array([1., 0., 0., 0.])
In [ ]:
# SGD predictions
pipe_sg.predict(features_new)
Out[ ]:
array([1., 0., 0., 0.])
In [ ]:
# XGB predictions
pipe_xg.predict(features_new)
Out[ ]:
array([1, 0, 0, 0])
In [ ]:
# Final model predictions
final.predict(features_new)
Out[ ]:
array([1., 0., 0., 0.])

The logistic regression, SVC, SGD, and XGB models each classify all four composed reviews correctly, and the final voting model is 100% correct as well.

Conclusions¶

Overall, we completed The Film Junky Union's objective of creating a model that effectively detects negative reviews. We exceeded the target F1 score of 0.85, reaching 0.865 on the test set. The F1 metric considers both precision and recall, so a high F1 score means the model has both high precision and high recall. We even tested the model with reviews we composed ourselves as an illustration of how the model works.
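
For reference, F1 is the harmonic mean of precision and recall:

$$F_1 = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$$

Because the harmonic mean is pulled toward the smaller of the two values, a test-set score of 0.865 guarantees that neither precision nor recall is low.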