Yachay is an open-source machine learning community with decades' worth of natural language data from media, the dark web, legal proceedings, and government publications. The community has cleaned and annotated the data and built a geolocation detection tool, and is looking for developers interested in contributing to and improving the project. We are given a dataset of tweets and a dataset of cluster coordinates, from which we will build a neural network that predicts coordinates from text.
# import libraries
import pandas as pd
import plotly.express as px
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import make_column_transformer
import seaborn as sns
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, concatenate
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from numpy import genfromtxt
import torch
import transformers
from tqdm.auto import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import math
from sklearn.metrics.pairwise import haversine_distances
from math import radians
from transformers import XLMRobertaTokenizerFast
import joblib
from math import ceil
# show graphs in html
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"
# read dataset
df_main = pd.read_csv('data/Main_Dataset.csv', parse_dates=['timestamp'], index_col=['timestamp'])
# sort by timestamp
df_main.sort_index(inplace=True)
# look at dataset
df_main.head()
| timestamp | id | text | user_id | cluster_id |
|---|---|---|---|---|
| 2021-02-01 12:14:04 | 262304 | he was accused of being a thief when entering ... | 8.301517e+08 | 345 |
| 2021-02-01 12:19:19 | 480231 | can you blame him they are delicious | 1.817364e+08 | 1775 |
| 2021-02-01 12:21:52 | 241532 | damn sholl is a new month ain’t it | 2.383825e+08 | 288 |
| 2021-02-01 12:28:10 | 324986 | ain’t felt this way inna min | 4.598618e+08 | 603 |
| 2021-02-01 12:35:05 | 541682 | golf central is practically unwatchable this m... | 4.041955e+09 | 2318 |
# check whether the index is sorted
df_main.index.is_monotonic_increasing
False
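The check returns False even though we just sorted the index. The reason is the NaT (missing) timestamps: `sort_index` places them at the end, and pandas treats any index containing missing values as non-monotonic. A minimal sketch of the behavior:

```python
import pandas as pd

# A sorted DatetimeIndex that contains NaT is not considered monotonic.
idx = pd.DatetimeIndex(["2021-02-01", "2021-02-02", pd.NaT])
print(idx.is_monotonic_increasing)           # False, because of the NaT entry
print(idx.dropna().is_monotonic_increasing)  # True once NaT is removed
```

This is confirmed below: the rows with missing index values are exactly the ones that break monotonicity.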
# look at column information
df_main.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 604206 entries, 2021-02-01 12:14:04 to NaT
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   id          604206 non-null  int64
 1   text        604206 non-null  object
 2   user_id     604206 non-null  float64
 3   cluster_id  604206 non-null  int64
dtypes: float64(1), int64(2), object(1)
memory usage: 23.0+ MB
# looking for missing values
df_main.isna().sum()
id            0
text          0
user_id       0
cluster_id    0
dtype: int64
# looking for duplicates
df_main.duplicated().sum()
0
# data with missing index
df_main.index.isna().sum()
12794
# percentage of data with missing index
df_main.index.isna().sum() / len(df_main) * 100
2.1174897303237636
# looking at missing data with missing index
df_main[df_main.index.isna()].head()
| timestamp | id | text | user_id | cluster_id |
|---|---|---|---|---|
| NaT | 80822 | josh in to close it out.“filthy”in the 6th. ba... | 2.968974e+09 | 26 |
| NaT | 80828 | it passed but it’s beautiful | 1.549382e+07 | 26 |
| NaT | 80830 | pure sunshine flowerreport toronto | 1.292381e+07 | 26 |
| NaT | 80832 | you beat me to it. so tired of the population ... | 1.649766e+07 | 26 |
| NaT | 80838 | i’m on a boat toronto island time | 1.924758e+09 | 26 |
Overall, the main dataset is fairly clean. We loaded the data as a time series and parsed the dates. This dataframe contains most of the features we need to train our model. The missing data is limited to timestamps; every other column is fully populated. Since the missing timestamps make up only about 2% of the dataset and cannot be imputed, we will drop these rows.
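The drop described above amounts to filtering out rows whose index is NaT (the notebook does this via `dropna` after the merge below). A minimal sketch on a toy frame mirroring the situation:

```python
import pandas as pd

# Toy frame mirroring the situation: one timestamp failed to parse (NaT).
df_toy = pd.DataFrame(
    {"text": ["a", "b", "c"]},
    index=pd.DatetimeIndex(["2021-02-01 12:14:04", None, "2021-02-01 12:19:19"]),
)

# Keep only the rows whose index is a valid timestamp.
df_toy = df_toy[df_toy.index.notna()]
print(len(df_toy))  # 2
```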
# load cluster data
df_cl = pd.read_csv('data/Clusters_Coordinates.csv')
# look at dataset
df_cl.head()
|   | cluster_id | lat | lng |
|---|---|---|---|
| 0 | 2 | 34.020789 | -118.411907 |
| 1 | 3 | 31.168893 | -100.076888 |
| 2 | 8 | 29.838495 | -95.446487 |
| 3 | 9 | 40.780709 | -73.968542 |
| 4 | 16 | 40.004866 | -75.117998 |
# looking at column info
df_cl.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   cluster_id  850 non-null    int64
 1   lat         850 non-null    float64
 2   lng         850 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 20.0 KB
# looking for missing values
df_cl.isna().sum()
cluster_id    0
lat           0
lng           0
dtype: int64
The cluster coordinates dataframe contains the cluster id along with the latitude and longitude of each cluster. It is clean, with no missing values. We will merge the two dataframes before conducting EDA.
# visual of data before feature engineering
df_main.head()
| timestamp | id | text | user_id | cluster_id |
|---|---|---|---|---|
| 2021-02-01 12:14:04 | 262304 | he was accused of being a thief when entering ... | 8.301517e+08 | 345 |
| 2021-02-01 12:19:19 | 480231 | can you blame him they are delicious | 1.817364e+08 | 1775 |
| 2021-02-01 12:21:52 | 241532 | damn sholl is a new month ain’t it | 2.383825e+08 | 288 |
| 2021-02-01 12:28:10 | 324986 | ain’t felt this way inna min | 4.598618e+08 | 603 |
| 2021-02-01 12:35:05 | 541682 | golf central is practically unwatchable this m... | 4.041955e+09 | 2318 |
# Making timestamp features
def make_features(data):
    data['year'] = data.index.year
    data['month'] = data.index.month
    data['week'] = data.index.isocalendar().week
    data['day'] = data.index.day
    data['day_of_week'] = data.index.day_of_week
    data['day_of_year'] = data.index.day_of_year
    data['hour'] = data.index.hour
    data['minute'] = data.index.minute
    data['second'] = data.index.second
make_features(df_main)
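As a quick sanity check, a self-contained restatement of `make_features` applied to a one-row frame with the first timestamp from the table above confirms the derived values (2021-02-01 is a Monday, the 32nd day of the year, in ISO week 5):

```python
import pandas as pd

# Self-contained restatement of make_features, checked on one known date.
def make_features(data):
    data['year'] = data.index.year
    data['month'] = data.index.month
    data['week'] = data.index.isocalendar().week
    data['day'] = data.index.day
    data['day_of_week'] = data.index.day_of_week
    data['day_of_year'] = data.index.day_of_year
    data['hour'] = data.index.hour
    data['minute'] = data.index.minute
    data['second'] = data.index.second

toy = pd.DataFrame({'x': [1]}, index=pd.DatetimeIndex(['2021-02-01 12:14:04']))
make_features(toy)
print(toy[['year', 'month', 'week', 'day_of_week', 'day_of_year']].iloc[0].tolist())
# [2021, 2, 5, 0, 32] -- matching the first row of the table below
```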
# new features added
df_main.head()
| timestamp | id | text | user_id | cluster_id | year | month | week | day | day_of_week | day_of_year | hour | minute | second |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2021-02-01 12:14:04 | 262304 | he was accused of being a thief when entering ... | 8.301517e+08 | 345 | 2021.0 | 2.0 | 5 | 1.0 | 0.0 | 32.0 | 12.0 | 14.0 | 4.0 |
| 2021-02-01 12:19:19 | 480231 | can you blame him they are delicious | 1.817364e+08 | 1775 | 2021.0 | 2.0 | 5 | 1.0 | 0.0 | 32.0 | 12.0 | 19.0 | 19.0 |
| 2021-02-01 12:21:52 | 241532 | damn sholl is a new month ain’t it | 2.383825e+08 | 288 | 2021.0 | 2.0 | 5 | 1.0 | 0.0 | 32.0 | 12.0 | 21.0 | 52.0 |
| 2021-02-01 12:28:10 | 324986 | ain’t felt this way inna min | 4.598618e+08 | 603 | 2021.0 | 2.0 | 5 | 1.0 | 0.0 | 32.0 | 12.0 | 28.0 | 10.0 |
| 2021-02-01 12:35:05 | 541682 | golf central is practically unwatchable this m... | 4.041955e+09 | 2318 | 2021.0 | 2.0 | 5 | 1.0 | 0.0 | 32.0 | 12.0 | 35.0 | 5.0 |
# merge main and cluster coordinates
df = df_main.merge(df_cl, on='cluster_id', sort=True)
# new merged dataset
df.head()
|   | id | text | user_id | cluster_id | year | month | week | day | day_of_week | day_of_year | hour | minute | second | lat | lng |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4080 | i moved all day yesterday slept for 2 hours la... | 2.688113e+07 | 2 | 2021.0 | 2.0 | 5 | 2.0 | 1.0 | 33.0 | 8.0 | 23.0 | 23.0 | 34.020789 | -118.411907 |
| 1 | 10213 | in the vortex 16x20 acrylicpainting | 8.080801e+07 | 2 | 2021.0 | 2.0 | 5 | 2.0 | 1.0 | 33.0 | 8.0 | 28.0 | 8.0 | 34.020789 | -118.411907 |
| 2 | 12514 | pes21 is free lmfao | 1.286578e+18 | 2 | 2021.0 | 2.0 | 5 | 2.0 | 1.0 | 33.0 | 8.0 | 28.0 | 19.0 | 34.020789 | -118.411907 |
| 3 | 10843 | ha yeah there’s no way of ever really knowing ... | 1.759526e+07 | 2 | 2021.0 | 2.0 | 5 | 2.0 | 1.0 | 33.0 | 8.0 | 31.0 | 49.0 | 34.020789 | -118.411907 |
| 4 | 16316 | . shut the fuck up you fake ass nerd. | 1.081937e+08 | 2 | 2021.0 | 2.0 | 5 | 2.0 | 1.0 | 33.0 | 8.0 | 33.0 | 42.0 | 34.020789 | -118.411907 |
# drop missing values
df.dropna(inplace=True)
# missing values
df.isna().sum()
id             0
text           0
user_id        0
cluster_id     0
year           0
month          0
week           0
day            0
day_of_week    0
day_of_year    0
hour           0
minute         0
second         0
lat            0
lng            0
dtype: int64
# shape of dataset
df.shape
(591412, 15)
We merged the datasets on cluster id and then dropped all rows with missing timestamps. We are left with 591,412 rows, just under 98% of the original dataset.
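One detail of the merge worth noting: `DataFrame.merge` defaults to an inner join, so any tweet whose `cluster_id` has no entry in the coordinates table is silently dropped. A toy sketch of this behavior:

```python
import pandas as pd

# Toy versions of the two frames: one tweet's cluster has no coordinates.
tweets = pd.DataFrame({"text": ["a", "b", "c"], "cluster_id": [2, 3, 99]})
coords = pd.DataFrame({"cluster_id": [2, 3],
                       "lat": [34.0, 31.2], "lng": [-118.4, -100.1]})

# Default how='inner': rows without a matching cluster_id are dropped.
merged = tweets.merge(coords, on="cluster_id", sort=True)
print(len(merged))  # 2 -- the cluster_id=99 tweet has no coordinates
```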
# summary statistics
df.describe()
|   | id | user_id | cluster_id | year | month | week | day | day_of_week | day_of_year | hour | minute | second | lat | lng |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 591412.000000 | 5.914120e+05 | 591412.000000 | 591412.0 | 591412.000000 | 591412.000000 | 591412.000000 | 591412.000000 | 591412.000000 | 591412.000000 | 591412.000000 | 591412.000000 | 591412.000000 | 591412.000000 |
| mean | 303001.854719 | 3.368510e+17 | 886.351569 | 2021.0 | 6.766528 | 27.205567 | 15.575266 | 2.928047 | 190.367018 | 11.887419 | 29.533308 | 29.478575 | 34.433504 | -92.672279 |
| std | 174405.411743 | 5.209262e+17 | 895.607222 | 0.0 | 2.434100 | 10.535272 | 8.408276 | 2.018068 | 73.657062 | 7.792265 | 17.330192 | 17.359607 | 7.609369 | 16.107334 |
| min | 0.000000 | 6.070000e+02 | 2.000000 | 2021.0 | 2.000000 | 5.000000 | 1.000000 | 0.000000 | 32.000000 | 0.000000 | 0.000000 | 0.000000 | 13.189300 | -158.069430 |
| 25% | 152142.750000 | 1.448996e+08 | 105.000000 | 2021.0 | 8.000000 | 31.000000 | 8.000000 | 1.000000 | 214.000000 | 4.000000 | 15.000000 | 14.000000 | 30.215828 | -100.076888 |
| 50% | 304025.500000 | 8.397085e+08 | 502.000000 | 2021.0 | 8.000000 | 32.000000 | 16.000000 | 3.000000 | 224.000000 | 14.000000 | 30.000000 | 30.000000 | 34.182160 | -90.079239 |
| 75% | 453573.250000 | 8.556889e+17 | 1567.000000 | 2021.0 | 8.000000 | 33.000000 | 23.000000 | 5.000000 | 233.000000 | 19.000000 | 45.000000 | 44.000000 | 40.624274 | -79.850739 |
| max | 604205.000000 | 1.431375e+18 | 2996.000000 | 2021.0 | 9.000000 | 35.000000 | 31.000000 | 6.000000 | 244.000000 | 23.000000 | 59.000000 | 59.000000 | 61.235042 | -52.829425 |
# number of unique users
df.user_id.nunique()
41143
# number of unique clusters
df.cluster_id.nunique()
850
# number of unique latitudes
df.lat.nunique()
811
# number of unique longitudes
df.lng.nunique()
828
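Since the goal is to predict coordinates, prediction error is naturally measured as a great-circle distance, which is why `haversine_distances` appears in the imports. It expects `[lat, lng]` pairs in radians and returns distances in radians, which must be scaled by the Earth's radius. A sketch using two of the cluster centroids from the coordinates table above (the Los Angeles and New York clusters):

```python
import numpy as np
from sklearn.metrics.pairwise import haversine_distances

EARTH_RADIUS_KM = 6371.0

# Two cluster centroids from the coordinates table: Los Angeles and New York.
la = np.radians([34.020789, -118.411907])
ny = np.radians([40.780709, -73.968542])

# haversine_distances takes [lat, lng] in radians; the result is in radians.
dist_km = haversine_distances([la, ny])[0, 1] * EARTH_RADIUS_KM
print(round(dist_km))  # roughly 3950 km between the two centroids
```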
# skew of numeric columns
df.skew(numeric_only=True)
id            -0.013441
user_id        0.998650
cluster_id     0.794588
year           0.000000
month         -1.439251
week          -1.408017
day           -0.000922
day_of_week    0.061273
day_of_year   -1.408617
hour          -0.182448
minute        -0.005914
second        -0.006647
lat           -0.597998
lng           -0.664069
dtype: float64
# correlation of data
px.imshow(df.corr(numeric_only=True), text_auto=True, aspect='auto')
# distributions of columns
columns = ['month', 'week', 'day', 'day_of_week', 'day_of_year', 'hour', 'minute', 'second']
for column in columns:
    px.histogram(df[column], title='Distribution of ' + column.upper().replace('_', ' '), labels={'value': column.replace('_', ' ')}).show()