!pip install --user -U plotly_express

Requirement already satisfied: plotly_express in c:\users\xix\appdata\roaming\python\python39\site-packages (0.4.1)
Requirement already satisfied: numpy>=1.11 in c:\users\xix\anaconda3\lib\site-packages (from plotly_express) (1.23.5)
Requirement already satisfied: patsy>=0.5 in c:\users\xix\anaconda3\lib\site-packages (from plotly_express) (0.5.2)
Requirement already satisfied: scipy>=0.18 in c:\users\xix\anaconda3\lib\site-packages (from plotly_express) (1.8.1)
Requirement already satisfied: plotly>=4.1.0 in c:\users\xix\anaconda3\lib\site-packages (from plotly_express) (5.6.0)
Requirement already satisfied: statsmodels>=0.9.0 in c:\users\xix\anaconda3\lib\site-packages (from plotly_express) (0.13.2)
Requirement already satisfied: pandas>=0.20.0 in c:\users\xix\anaconda3\lib\site-packages (from plotly_express) (1.4.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\xix\anaconda3\lib\site-packages (from pandas>=0.20.0->plotly_express) (2021.3)
Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\xix\anaconda3\lib\site-packages (from pandas>=0.20.0->plotly_express) (2.8.2)
Requirement already satisfied: six in c:\users\xix\anaconda3\lib\site-packages (from patsy>=0.5->plotly_express) (1.16.0)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\xix\anaconda3\lib\site-packages (from plotly>=4.1.0->plotly_express) (8.0.1)
Requirement already satisfied: packaging>=21.3 in c:\users\xix\anaconda3\lib\site-packages (from statsmodels>=0.9.0->plotly_express) (21.3)
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in c:\users\xix\anaconda3\lib\site-packages (from packaging>=21.3->statsmodels>=0.9.0->plotly_express) (3.0.4)


# Import useful packages
import pandas as pd
import numpy as np
import math as mt
from scipy import stats as st
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go


# Read the dataframe
df = pd.read_csv('datasets/games.csv')


# Visual of the Dataframe
display(df.head())


# We see the columns need to be changed to lowercase
df.columns

Index(['Name', 'Platform', 'Year_of_Release', 'Genre', 'NA_sales', 'EU_sales',
       'JP_sales', 'Other_sales', 'Critic_Score', 'User_Score', 'Rating'],
      dtype='object')


# Converting columns to lowercase
df.columns = df.columns.str.lower()


# Info of dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16715 entries, 0 to 16714
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             16713 non-null  object 
 1   platform         16715 non-null  object 
 2   year_of_release  16446 non-null  float64
 3   genre            16713 non-null  object 
 4   na_sales         16715 non-null  float64
 5   eu_sales         16715 non-null  float64
 6   jp_sales         16715 non-null  float64
 7   other_sales      16715 non-null  float64
 8   critic_score     8137 non-null   float64
 9   user_score       10014 non-null  object 
 10  rating           9949 non-null   object 
dtypes: float64(6), object(5)
memory usage: 1.4+ MB


# Description of the dataset
df.describe()


# Unique values of each column
df.nunique()

name               11559
platform              31
year_of_release       37
genre                 12
na_sales             402
eu_sales             307
jp_sales             244
other_sales          155
critic_score          82
user_score            96
rating                 8
dtype: int64


# Visual of different user score values
df['user_score'].unique()

array(['8', nan, '8.3', '8.5', '6.6', '8.4', '8.6', '7.7', '6.3', '7.4',
       '8.2', '9', '7.9', '8.1', '8.7', '7.1', '3.4', '5.3', '4.8', '3.2',
       '8.9', '6.4', '7.8', '7.5', '2.6', '7.2', '9.2', '7', '7.3', '4.3',
       '7.6', '5.7', '5', '9.1', '6.5', 'tbd', '8.8', '6.9', '9.4', '6.8',
       '6.1', '6.7', '5.4', '4', '4.9', '4.5', '9.3', '6.2', '4.2', '6',
       '3.7', '4.1', '5.8', '5.6', '5.5', '4.4', '4.6', '5.9', '3.9',
       '3.1', '2.9', '5.2', '3.3', '4.7', '5.1', '3.5', '2.5', '1.9', '3',
       '2.7', '2.2', '2', '9.5', '2.1', '3.6', '2.8', '1.8', '3.8', '0',
       '1.6', '9.6', '2.4', '1.7', '1.1', '0.3', '1.5', '0.7', '1.2',
       '2.3', '0.5', '1.3', '0.2', '0.6', '1.4', '0.9', '1', '9.7'],
      dtype=object)


# replacing TBD with a value
df['user_score'] = df['user_score'].replace(to_replace='tbd', value=0)


# Need to convert user score from object to integer
df['user_score'] = pd.to_numeric(df['user_score'], errors='coerce')


# Change release year to integer, years dont have decimals
df['year_of_release'] = df['year_of_release'].astype('int', errors='ignore')


# Checking for duplicates 
df.duplicated().sum()

0


# Checking for duplicates filtering for same name and platform
df[df[['name', 'platform']].duplicated()].value_counts(ascending=False)

name                         platform  year_of_release  genre   na_sales  eu_sales  jp_sales  other_sales  critic_score  user_score  rating
Madden NFL 13                PS3       2012.0           Sports  0.0       0.01      0.00      0.00         83.0          5.5         E         1
Need for Speed: Most Wanted  PC        2012.0           Racing  0.0       0.06      0.00      0.02         82.0          8.5         T         1
                             X360      2005.0           Racing  1.0       0.13      0.02      0.10         83.0          8.5         T         1
dtype: int64


# checking for duplicates based on same name, platform, genre
df[df[['name','platform', 'genre']].duplicated()]


# looking for games with duplicate platform and year of release as hidden duplicates
df[df[['name','platform', 'year_of_release']].duplicated()]


# Checking for Sonic
df[df['name']== 'Sonic the Hedgehog']


# Looking for Need for Speed
df[df['name']== 'Need for Speed: Most Wanted']


# Looking at Madden hidden duplicates
df[df['name']== 'Madden NFL 13']


# Dropping the hidden duplicate
df = df.drop_duplicates(subset=['name','platform', 'year_of_release'], keep='first')


# QC check
df[df[['name','platform', 'genre', 'year_of_release']].duplicated()]


# looking for hidden duplicates
df.groupby(['name', 'platform'])['name'].value_counts(ascending=False)

name                          platform  name                        
 Beyblade Burst               3DS        Beyblade Burst                 1
 Fire Emblem Fates            3DS        Fire Emblem Fates              1
 Frozen: Olaf's Quest         3DS        Frozen: Olaf's Quest           1
                              DS         Frozen: Olaf's Quest           1
 Haikyu!! Cross Team Match!   3DS        Haikyu!! Cross Team Match!     1
                                                                       ..
uDraw Studio                  Wii       uDraw Studio                    1
uDraw Studio: Instant Artist  Wii       uDraw Studio: Instant Artist    1
                              X360      uDraw Studio: Instant Artist    1
wwe Smackdown vs. Raw 2006    PS2       wwe Smackdown vs. Raw 2006      1
¡Shin Chan Flipa en colores!  DS        ¡Shin Chan Flipa en colores!    1
Name: name, Length: 16709, dtype: int64


# Looking for missing values
df.isna().sum()

name                  1
platform              0
year_of_release     269
genre                 1
na_sales              0
eu_sales              0
jp_sales              0
other_sales           0
critic_score       8577
user_score         6700
rating             6765
dtype: int64


# Visual of the rows of missing names
df[df['name'].isna()]


# Dropping the rows of missing names
df = df.dropna(subset=['name', 'genre'])


# Looking for vales with missing release years
df[df['year_of_release'].isna()]


# Mean release year per platform
df.groupby('platform')['year_of_release'].mean().round()

platform
2600    1982.0
3DO     1995.0
3DS     2013.0
DC      2000.0
DS      2008.0
GB      1996.0
GBA     2003.0
GC      2003.0
GEN     1993.0
GG      1992.0
N64     1999.0
NES     1987.0
NG      1994.0
PC      2009.0
PCFX    1996.0
PS      1998.0
PS2     2005.0
PS3     2011.0
PS4     2015.0
PSP     2009.0
PSV     2014.0
SAT     1996.0
SCD     1994.0
SNES    1994.0
TG16    1995.0
WS      2000.0
Wii     2009.0
WiiU    2014.0
X360    2010.0
XB      2004.0
XOne    2015.0
Name: year_of_release, dtype: float64


# Median release year per platform
df.groupby('platform')['year_of_release'].median()

platform
2600    1982.0
3DO     1995.0
3DS     2013.0
DC      2000.0
DS      2008.0
GB      1997.0
GBA     2003.0
GC      2003.0
GEN     1993.0
GG      1992.0
N64     1999.0
NES     1986.5
NG      1994.5
PC      2010.0
PCFX    1996.0
PS      1998.0
PS2     2005.0
PS3     2011.0
PS4     2015.0
PSP     2009.0
PSV     2014.0
SAT     1996.0
SCD     1994.0
SNES    1994.0
TG16    1995.0
WS      2000.0
Wii     2009.0
WiiU    2013.0
X360    2010.0
XB      2004.0
XOne    2015.0
Name: year_of_release, dtype: float64


# Percent of data with missing year
(df['year_of_release'].isna().sum() / df['year_of_release'].count()) * 100

1.635954509517728


# fill year of release based on median of platform grouping
df = df.dropna(subset=['year_of_release'])


# checking removal of missing values
df['year_of_release'].isna().sum()

0


# Change release year to integer
df['year_of_release'] = df['year_of_release'].astype('int')


# info on columns
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16443 entries, 0 to 16714
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             16443 non-null  object 
 1   platform         16443 non-null  object 
 2   year_of_release  16443 non-null  int32  
 3   genre            16443 non-null  object 
 4   na_sales         16443 non-null  float64
 5   eu_sales         16443 non-null  float64
 6   jp_sales         16443 non-null  float64
 7   other_sales      16443 non-null  float64
 8   critic_score     7982 non-null   float64
 9   user_score       9838 non-null   float64
 10  rating           9767 non-null   object 
dtypes: float64(6), int32(1), object(4)
memory usage: 1.4+ MB


# Looking for missing ratings
df['rating'].isna().sum()

6676


# trying to figure out missing ratings
df.groupby(['name','genre', 'platform'])['rating'].value_counts()

name                          genre         platform  rating
 Tales of Xillia 2            Role-Playing  PS3       T         1
.hack//Infection Part 1       Role-Playing  PS2       T         1
.hack//Mutation Part 2        Role-Playing  PS2       T         1
.hack//Outbreak Part 3        Role-Playing  PS2       T         1
007 Racing                    Racing        PS        T         1
                                                               ..
thinkSMART FAMILY!            Misc          Wii       E         1
thinkSMART: Chess for Kids    Misc          DS        E         1
uDraw Studio                  Misc          Wii       E         1
uDraw Studio: Instant Artist  Misc          Wii       E         1
                                            X360      E         1
Name: rating, Length: 9765, dtype: int64


# The ratings of the titles, based on the platform
df.groupby(['name', 'platform'])['rating'].value_counts()

name                          platform  rating
 Tales of Xillia 2            PS3       T         1
.hack//Infection Part 1       PS2       T         1
.hack//Mutation Part 2        PS2       T         1
.hack//Outbreak Part 3        PS2       T         1
007 Racing                    PS        T         1
                                                 ..
thinkSMART FAMILY!            Wii       E         1
thinkSMART: Chess for Kids    DS        E         1
uDraw Studio                  Wii       E         1
uDraw Studio: Instant Artist  Wii       E         1
                              X360      E         1
Name: rating, Length: 9765, dtype: int64


# Grouping titles based on rating
df.groupby('name')['rating'].value_counts()

name                          rating
 Tales of Xillia 2            T         1
.hack//Infection Part 1       T         1
.hack//Mutation Part 2        T         1
.hack//Outbreak Part 3        T         1
007 Racing                    T         1
                                       ..
thinkSMART                    E         1
thinkSMART FAMILY!            E         1
thinkSMART: Chess for Kids    E         1
uDraw Studio                  E         1
uDraw Studio: Instant Artist  E         2
Name: rating, Length: 6170, dtype: int64


# Rating count per title
df[['name', 'rating']].value_counts()

name                     rating
FIFA Soccer 13           E         8
Ratatouille              E         8
FIFA 15                  E         8
Terraria                 T         8
FIFA 14                  E         8
                                  ..
Imagine: Teacher         E         1
Imagine: Sweet 16        E         1
Imagine: Soccer Captain  E         1
Imagine: Salon Stylist   E         1
Mega Man Legends 2       E         1
Length: 6170, dtype: int64


# counting the different ratings of platforms
platform_rating = df[['platform', 'rating']].value_counts().reset_index()
display(platform_rating)


# sorting by platform
platform_rating.sort_values('platform')


# Best option 
genre_rating = df[['genre', 'rating']].value_counts()


# Changing game names to lowercase, platforms to uppercase
df['name'] = df['name'].str.lower()
df['platform'] = df['platform'].str.upper()


# scaling down critic score
df['critic_score'] = df['critic_score'] / 10


# Creating average score column
df['average_score'] = df[['critic_score', 'user_score']].mean(axis=1)


# Adding total sales column
df['total_sales'] = df[['na_sales','eu_sales', 'jp_sales', 'other_sales']].sum(axis=1)


# Adding average sales column
df['average_sales'] = df[['na_sales','eu_sales', 'jp_sales', 'other_sales']].mean(axis=1)


# Number of games released in each year
game_years = df['year_of_release'].value_counts()
game_years = pd.DataFrame(game_years)


# The years with the largest number of games released
game_years.head()


# plot of games released each year
px.bar(game_years, title='Number of Games Released Each Year').show()


# games released per genre
px.histogram(df, x='year_of_release', color='genre', title='Number of Games Released Per Genre').show()


# games released per platform
px.histogram(df, x='year_of_release', color='platform', title='Number of Games Released Per Platform').show()


# Looking at all the different platforms 
df['platform'].unique()

array(['WII', 'NES', 'GB', 'DS', 'X360', 'PS3', 'PS2', 'SNES', 'GBA',
       'PS4', '3DS', 'N64', 'PS', 'XB', 'PC', '2600', 'PSP', 'XONE',
       'WIIU', 'GC', 'GEN', 'DC', 'PSV', 'SAT', 'SCD', 'WS', 'NG', 'TG16',
       '3DO', 'GG', 'PCFX'], dtype=object)


# Looking at all the all the genres
df['genre'].unique()

array(['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc',
       'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure',
       'Strategy'], dtype=object)


# Median critic score of each platform
df.groupby('platform')['critic_score'].median()

platform
2600     NaN
3DO      NaN
3DS     6.80
DC      8.80
DS      6.60
GB       NaN
GBA     6.90
GC      7.00
GEN      NaN
GG       NaN
N64      NaN
NES      NaN
NG       NaN
PC      7.80
PCFX     NaN
PS      7.35
PS2     7.00
PS3     7.30
PS4     7.30
PSP     6.80
PSV     7.10
SAT      NaN
SCD      NaN
SNES     NaN
TG16     NaN
WII     6.50
WIIU    7.35
WS       NaN
X360    7.10
XB      7.20
XONE    7.60
Name: critic_score, dtype: float64


# Counts of different user scores
df['user_score'].value_counts()

0.0    2377
7.8     322
8.0     285
8.2     276
8.3     252
       ... 
1.5       2
0.3       2
1.1       2
1.9       2
9.7       1
Name: user_score, Length: 95, dtype: int64


# Median of user score per platform
df.groupby('platform')['user_score'].median()

platform
2600     NaN
3DO      NaN
3DS     6.30
DC      8.80
DS      0.00
GB       NaN
GBA     0.00
GC      7.60
GEN      NaN
GG       NaN
N64      NaN
NES      NaN
NG       NaN
PC      7.40
PCFX     NaN
PS      7.80
PS2     7.75
PS3     6.90
PS4     7.00
PSP     7.10
PSV     7.50
SAT      NaN
SCD      NaN
SNES     NaN
TG16     NaN
WII     4.50
WIIU    7.00
WS       NaN
X360    6.80
XB      7.50
XONE    6.60
Name: user_score, dtype: float64


# Total sales based on median user score
df.groupby('total_sales')['user_score'].median()

total_sales
0.00     NaN
0.01     5.2
0.02     5.4
0.03     4.8
0.04     5.1
        ... 
31.38    NaN
32.77    8.0
35.52    8.3
40.24    NaN
82.54    8.0
Name: user_score, Length: 1004, dtype: float64


# Count of the different ratings 
df['rating'].value_counts()

E       3920
T       2905
M       1536
E10+    1393
EC         8
K-A        3
AO         1
RP         1
Name: rating, dtype: int64


# Platform average and total sales
top_platforms = df.pivot_table(index='platform', values=['total_sales', 'average_sales'], aggfunc='sum').reset_index()


# top grossing year
top_five_years = df.pivot_table(index='year_of_release', values=['total_sales', 'average_sales'], aggfunc='sum').reset_index()
top_five_years = top_five_years.nlargest(5, columns='total_sales')


# top 5 grossing years
px.bar(top_five_years, x='year_of_release', y=['total_sales', 'average_sales'], barmode='group', title='Total Sales Per Top Release Year').show()


# platform total and average sales
px.bar(top_platforms, x='platform', y=['total_sales', 'average_sales'], barmode='group', title='Platform Sales').show()


# Platform sales by region, average and total
platform_sales = df.pivot_table(index='platform', values=['na_sales', 'eu_sales', 'jp_sales', 'other_sales', 'total_sales', 'average_sales'], aggfunc='sum').reset_index()


# platform regional sales
px.bar(platform_sales, x='platform', y=['eu_sales','na_sales', 
'jp_sales', 'other_sales'], width=1400, title='Regional Sales Per Platform').show()


# top grossing platforms
top_five_platforms = df.pivot_table(index='platform', values=['eu_sales','na_sales', 
'jp_sales', 'other_sales', 'total_sales', 'average_sales'], aggfunc='sum').reset_index()
top_five_platforms = top_five_platforms.nlargest(5, columns='total_sales')


# platform sales
px.bar(top_five_platforms, x='platform', y='total_sales',
 color='platform', title='Total Sales Per Platform').show()


# platform total and average sales
px.bar(top_five_platforms, x='platform', y=['total_sales', 'average_sales'],
barmode='group', title='Sales Per Platform', width=1200).show()


# platform sales boxplot
px.box(platform_sales, x='platform', y=['na_sales', 'eu_sales', 'jp_sales', 'other_sales'], title='Platform Sales').show()


# top grossing genre
top_five_genres = df.pivot_table(index='genre', values=['total_sales', 'average_sales'], aggfunc='sum').reset_index()
top_five_genres = top_five_genres.nlargest(5, columns='total_sales')


# sales of top 5 genres
px.bar(top_five_genres, x='genre', y='total_sales',
 color='genre', width=1300, title='Total Sales Among Top Genres').show()


# total and average sales, top 5 genres
px.bar(top_five_genres, x='genre', y=['total_sales', 'average_sales'],
width=1300, barmode='group', title='Sales Among Top Genres').show()


# share of top 5 genres
px.pie(top_five_genres, values='total_sales', names='genre').show()


# top grossing games
top_five_games = df.pivot_table(index='name', values=['total_sales', 'average_sales'], aggfunc='sum').reset_index()
top_five_games = top_five_games.nlargest(5, columns='total_sales')


# sales of top 5 games
px.bar(top_five_games, x='name', y='total_sales', 
color='name', title='Sales Among Top Games').show()


# total and average sales, top 5 games
px.bar(top_five_games, x='name', y=['total_sales', 'average_sales'], barmode='group', title='Sales Among Top Games').show()


# Sales based on critic scores
critic_sales = df.pivot_table(index='critic_score', values=['eu_sales','na_sales', 
'jp_sales', 'other_sales', 'total_sales', 'average_sales'], aggfunc='sum').reset_index()


# top grossing critic score
top_five_critic = critic_sales.nlargest(5, columns=['total_sales', 'average_sales'])


# regional sales, critic scores
px.bar(critic_sales, x='critic_score', y=['eu_sales','na_sales', 
'jp_sales', 'other_sales'], title='Regional Sales Based on Critic Scores').show()


# average sales, critic scores
px.bar(critic_sales, x='critic_score', y='average_sales', title='Average Sales Based on Critic Score').show()


# total and average sales, critic scores
px.bar(top_five_critic, x='critic_score', y=['total_sales', 'average_sales'], barmode='group',
    title='Total and Average Sales Based on Critic Score').show()


# correlation of sales and critic scores
df[['critic_score', 'eu_sales', 'jp_sales', 'na_sales', 'total_sales']].corr()


# Sorting by user score, then getting data on sales
user_sales = df.pivot_table(index='user_score', values=['eu_sales','na_sales', 
'jp_sales', 'other_sales', 'total_sales', 'average_sales'], aggfunc='sum').reset_index()


#regional sales
fig = px.bar(user_sales, x='user_score', y=['eu_sales','na_sales', 
'jp_sales', 'other_sales'], title='Regional Sales Based on User Scores', )

fig.update_xaxes(range=[0.1, 10])
fig.show()


# average sales
fig = px.bar(user_sales, x='user_score', y='average_sales', title='Average Sales Based on User Score')

fig.update_xaxes(range=[0.1, 10])
fig.show()


# top grossing user score
top_five_user = user_sales.nlargest(5, columns=['total_sales', 'average_sales'])


# sales of top scores
fig = px.bar(top_five_user, x='user_score', y=['total_sales', 'average_sales'], barmode='group', 
title='Total and Average Sales Based on User Score')

fig.update_xaxes(range=[6, 10])
fig.show()


# correlation of user score and sales
df[['user_score', 'eu_sales', 'jp_sales', 'na_sales', 'total_sales']].corr()


# Sorting data by average score, then looking at sales
average_score_sales = df.pivot_table(index='average_score', values=['eu_sales','na_sales', 
'jp_sales', 'other_sales', 'total_sales', 'average_sales'], aggfunc='sum').reset_index()


# regional sales and average scores
fig = px.bar(average_score_sales, x='average_score', y=['eu_sales','na_sales', 
'jp_sales', 'other_sales'], title='Regional Sales Based on Average Scores')

fig.update_xaxes(range=[0.1, 10])
fig.show()


# average sales, average score
fig = px.bar(average_score_sales, x='average_score', y='average_sales', title='Average Sales Based on Average Score')

fig.update_xaxes(range=[0.1, 10])
fig.show()


# Dataframe of sales based on release year
yearly_sales = df.groupby('year_of_release')['total_sales', 'average_sales'].sum().sort_values(by='total_sales', ascending=False).reset_index()

C:\Users\XIX\AppData\Local\Temp\ipykernel_23028\1022505698.py:2: FutureWarning:

Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.


# total sales, year of release
px.bar(yearly_sales, x='year_of_release', y=['total_sales', 'average_sales'], title='Total Sales by Year of Release', barmode='group').show()


# yearly regional sales pivot table
yearly_region_sales = df.pivot_table(index='year_of_release', values=['na_sales', 'eu_sales', 'jp_sales', 'other_sales', 'total_sales', 'average_sales'], aggfunc='sum').reset_index()


# plot pivot table
px.bar(yearly_region_sales, x='year_of_release', 
y=['na_sales', 'eu_sales', 'jp_sales', 'other_sales'], title='Regional Sales Based on Year of Release').show()


# plot pivot table, average sales
px.bar(yearly_region_sales, x='year_of_release', 
y='average_sales', title='Average Sales Based on Year of Release').show()


# lifespan of the different platforms
platform_lifespan = df.groupby('platform')['year_of_release'].value_counts()
platform_lifespan = pd.DataFrame(platform_lifespan)


# reset the index for cleaner dataframe
platform_years = df.groupby(['platform', 'year_of_release'])['total_sales', 'average_sales'].count().reset_index(level=0)
platform_years = platform_years.reset_index()

C:\Users\XIX\AppData\Local\Temp\ipykernel_23028\3156860180.py:2: FutureWarning:

Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.


# Make a pivot table of platform and years of release, with total sales values
platform_pivot = platform_years.pivot_table(index='platform', columns='year_of_release', values='total_sales',)


# filter pivot for recent years
relevant_pp = platform_pivot.loc[:, 2013:2016].dropna()


# Adding a lifespan count column
relevant_pp['count'] = platform_pivot.loc[:, 2013:2016].dropna().sum(axis=1)


# Extracting the count colum to show lifespan of the platforms, descending order
platform_count_years = relevant_pp.count(axis=1)
platform_count_years = platform_count_years.sort_values(ascending=False)
platform_count_years.reset_index()


# statistical overview
platform_count_years.describe()

count    9.0
mean     5.0
std      0.0
min      5.0
25%      5.0
50%      5.0
75%      5.0
max      5.0
dtype: float64


# Filtering platforms by choices
platform_pivot_filtered = platform_pivot.loc[['PS2', 'PS3', 'WII', 'X360', 'DS']].dropna(axis=1, how='all')
platform_pivot_filtered


# filter out DS in 1985
final_platform_pivot =platform_pivot_filtered.loc[:,2000:2016]
final_platform_pivot = final_platform_pivot.reset_index()
final_platform_pivot


# Total Sales column 
final_platform_pivot['total'] = final_platform_pivot.sum(axis=1)
final_platform_pivot

C:\Users\XIX\AppData\Local\Temp\ipykernel_23028\324230748.py:2: FutureWarning:

Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.


# Look at the names of the columns
final_platform_pivot.columns

Index(['platform',       2000,       2001,       2002,       2003,       2004,
             2005,       2006,       2007,       2008,       2009,       2010,
             2011,       2012,       2013,       2014,       2015,       2016,
          'total'],
      dtype='object', name='year_of_release')


# list of years 
final_platform_pivot.columns.to_list()

['platform',
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 'total']


# platform lifespans
px.bar(final_platform_pivot, x='platform', 
y=[ 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,], 
title='Platform Lifespans 2000 to 2016').show()


# Filter data by last 4 years
last_period_sales = yearly_sales.sort_values('year_of_release', ascending=False).head(4)


# Filter parameters
key_platforms = ('PS2', 'X360', 'PS3', 'WII', 'DS')
key_years = [2013, 2014, 2015, 2016]


# Filter dataframe based on platforms and years
df_filtered = df[df['platform'].isin(key_platforms) & df['year_of_release'].isin(key_years)]


# total and average sales, recent years
px.bar(last_period_sales, x='year_of_release', y=['total_sales', 'average_sales'], barmode='group', 
    title='Total and Average Sales based on Year of Release').show()


# Most relevant platforms from recent time period
platform_pivot.loc[['3DS', 'PC', 'PS4', 'PSV', 'XONE'], 2000:2016]


# Filtering platforms from 2013 to 2016, by total sales in period
relevant_platforms = platform_pivot.loc[:,2013:2016].dropna(how='all')


# Filter for 5 platforms in the time period
most_relevant_platforms = relevant_platforms.loc[['3DS', 'PC', 'PS4', 'PSV', 'XONE']]
most_relevant_platforms


# platform game releases, year
px.bar(most_relevant_platforms, title='Most Relevant Platforms from 2013 to 2016', width=1200).show()


# total sales boxplot
px.box(most_relevant_platforms, title='Total Sales of Platforms from 2013 to 2016', width=1200).show()


# Take the most relevant platform dataset
df2 = most_relevant_platforms[:]


# Adding a column that totals platform sales
df2['period_total'] = df2.sum(axis=1)

C:\Users\XIX\AppData\Local\Temp\ipykernel_23028\1332112292.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# List of most relevant platforms
most_relevant_platforms_list = ('3DS', 'PS4', 'PC', 'XONE', 'PSV')


# Filter dataset by relevant platforms list, to get regional sales
df4 = platform_sales[platform_sales['platform'].isin(most_relevant_platforms_list)]


# Sorted by EU sales
df4.sort_values(by='eu_sales')


# Sorted by JP sales 
df4.sort_values(by='jp_sales')


# Sorted by NA sales
df4.sort_values(by='na_sales')


# Cum sum of sales in regions
df4_sum =df4.cumsum()
df4_sum = df4_sum[4:]
df4_sum


# DS ratios of total sales
ds = pd.DataFrame((df4.iloc[0,1:] / df4_sum.iloc[0, 1:]) * 100)
ds.columns= ['DS']
ds.reset_index()


# DS ratio of total sales list
ds_sales = ds['DS'].iloc[1:5].to_list()
ds_sales

[15.039642602911215, 72.42496221118549, 21.17493338798934, 12.735124760076774]


# DS regional sales numbers
ds_box = pd.DataFrame(df4.iloc[0,1:])
ds_box.columns= ['3DS']
ds_box.reset_index()


# DS regional sales numbers list
ds_box_sales = ds_box['3DS'].iloc[1:5].to_list()


# PC ratios of total sales
pc = pd.DataFrame((df4.iloc[1,1:] / df4_sum.iloc[0, 1:]) * 100)
pc.columns = ['PC']
pc.reset_index()


# PC ratio of total sales list
pc_sales = pc['PC'].iloc[1:5].to_list()


# PC regional sales numbers
pc_box = pd.DataFrame(df4.iloc[1,1:])
pc_box.columns= ['PC']
pc_box.reset_index()


# PC regional sales numbers  list
pc_box_sales = pc_box['PC'].iloc[1:5].to_list()


# PS4 ratios of total sales
ps4 = pd.DataFrame((df4.iloc[2,1:] / df4_sum.iloc[0, 1:]) * 100)
ps4.columns = ['PS4']
ps4.reset_index()


# PS4 ratios of total sales list
ps4_sales = ps4['PS4'].iloc[1:5].to_list()


# PS4 regional sales numbers
ps4_box = pd.DataFrame(df4.iloc[2,1:])
ps4_box.columns = ['PS4']
ps4_box.reset_index()


# PS4 regional sales numbers list
ps4_box_sales = ps4_box['PS4'].iloc[1:5].to_list()


# PSV ratios of total sales
psv = pd.DataFrame((df4.iloc[3,1:] / df4_sum.iloc[0, 1:]) * 100)
psv.columns = ['PSV']
psv.reset_index()


# PSV ratios of sales list
psv_sales = psv['PSV'].iloc[1:5].to_list()


# PSV sales numbers
psv_box = pd.DataFrame(df4.iloc[3,1:])
psv_box.columns = ['PSV']
psv_box.reset_index()


# PSV sales numbers list
psv_box_sales = psv_box['PSV'].iloc[1:5].to_list()


# XONE ratios of total sales
xone = pd.DataFrame((df4.iloc[4,1:] / df4_sum.iloc[0, 1:]) * 100)
xone.columns = ['XONE']
xone.reset_index()


# XONE ratios of sales list
xone_sales = xone['XONE'].iloc[1:5].to_list()


# XONE regional sales
xone_box = pd.DataFrame(df4.iloc[4,1:])
xone_box.columns = ['XONE']
xone_box.reset_index()


# XONE regional sales list
xone_box_sales = xone_box['XONE'].iloc[1:5].to_list()


x_data = ['3DS', 'PC', 'PS4', 'PSV', 'XONE']

y_data = [ds_box_sales , pc_box_sales, ps4_box_sales, psv_box_sales, xone_box_sales]

colors = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)', 'rgba(44, 160, 101, 0.5)',
          'rgba(255, 65, 54, 0.5)', 'rgba(207, 114, 255, 0.5)']

fig = go.Figure()

for xd, yd, cls in zip(x_data, y_data, colors):
        fig.add_trace(go.Box(
            y=yd,
            name=xd,
            jitter=0.5,
            whiskerwidth=0.2,
            fillcolor=cls,
            marker_size=2,
            line_width=1)
        )

fig.update_layout(
    title='Platform Sales from 2013 to 2016',
    yaxis=dict(
        autorange=True,
        showgrid=True,
        zeroline=True,
        dtick=10,
        gridcolor='rgb(255, 255, 255)',
        gridwidth=1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(
        l=40,
        r=30,
        b=80,
        t=100,
    ),
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
    showlegend=False
)

fig.show()


# DS data merged with PC
pt_1 = ds.T.reset_index().merge(pc.T.reset_index(), how='outer')

C:\Users\XIX\anaconda3\lib\site-packages\pandas\core\reshape\merge.py:916: FutureWarning:

In a future version, the Index constructor will not infer numeric dtypes when passed object-dtype sequences (matching Series behavior)


# Merge again with PS4
pt_2 = pt_1.merge(ps4.T.reset_index(), how='outer')

C:\Users\XIX\anaconda3\lib\site-packages\pandas\core\reshape\merge.py:916: FutureWarning:

In a future version, the Index constructor will not infer numeric dtypes when passed object-dtype sequences (matching Series behavior)


# Merge again with PSV
pt_3 = pt_2.merge(psv.T.reset_index(), how='outer')

C:\Users\XIX\anaconda3\lib\site-packages\pandas\core\reshape\merge.py:916: FutureWarning:

In a future version, the Index constructor will not infer numeric dtypes when passed object-dtype sequences (matching Series behavior)


# Platform Market share percentages 
pt_final = pt_3.merge(xone.T.reset_index(), how='outer')
pt_final

C:\Users\XIX\anaconda3\lib\site-packages\pandas\core\reshape\merge.py:916: FutureWarning:

In a future version, the Index constructor will not infer numeric dtypes when passed object-dtype sequences (matching Series behavior)


# platform market share, Europe
px.pie(pt_final, values=pt_final['eu_sales'], names=['3DS', 'PC', 'PS4', 'PSV', 'XONE'], title='European Region Platform Market Share').show()


# platform market share, North America
px.pie(pt_final, values=pt_final['na_sales'], names=['3DS', 'PC', 'PS4', 'PSV', 'XONE'], title='North American Region Platform Market Share').show()


# platform market share, Japan
px.pie(pt_final, values=pt_final['jp_sales'], names=['3DS', 'PC', 'PS4', 'PSV', 'XONE'], title='Japanese Region Platform Market Share').show()


# platform market share, Other
px.pie(pt_final, values=pt_final['other_sales'], names=['3DS', 'PC', 'PS4', 'PSV', 'XONE'], title='Other Region Platform Market Share').show()


# Comparison of 3DS and PC

#Null Hypothesis

## The mean sales of 3DS and PC are the same


alpha = 0.05

results = st.ttest_ind(ds_box_sales, pc_box_sales)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  0.9895189927444241
We can't reject the null hypothesis


# Comparison of 3DS and PS4

#Null Hypothesis

## The mean sales of 3DS and PS4 are the same


alpha = 0.05

results = st.ttest_ind(ds_box_sales, ps4_box_sales)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  0.6936421504882218
We can't reject the null hypothesis


# Comparison of 3DS and PSV

#Null Hypothesis

## The mean sales of 3DS and PSV are the same


alpha = 0.05

results = st.ttest_ind(ds_box_sales, psv_box_sales)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  0.03720070588836854
We reject the null hypothesis, the means are different


# Comparison of 3DS and XONE

#Null Hypothesis

## The mean sales of 3DS and XONE are the same


alpha = 0.05

results = st.ttest_ind(ds_box_sales, xone_box_sales)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  0.41423844518684094
We can't reject the null hypothesis


# Comparison of PC and PS4

#Null Hypothesis

## The mean sales of PC and PS4 are the same


alpha = 0.05

results = st.ttest_ind(pc_box_sales, ps4_box_sales)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  0.7458914072175874
We can't reject the null hypothesis


# Comparison of PC and PSV

#Null Hypothesis

## The mean sales of PC and PSV are the same


alpha = 0.05

results = st.ttest_ind(pc_box_sales, psv_box_sales)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  0.17112281031665025
We can't reject the null hypothesis


# Comparison of PC and XONE

#Null Hypothesis

## The mean sales of PC and XONE are the same


alpha = 0.05

results = st.ttest_ind(pc_box_sales, xone_box_sales)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  0.5536327634712646
We can't reject the null hypothesis


# Comparison of PS4 and PSV

#Null Hypothesis

## The mean sales of PS4 and PSV are the same


alpha = 0.05

results = st.ttest_ind(ps4_box_sales, psv_box_sales)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  0.06275189115886726
We can't reject the null hypothesis


# Comparison of PS4 and XONE

#Null Hypothesis

## The mean sales of PS4 and XONE are the same


alpha = 0.05

results = st.ttest_ind(ps4_box_sales, xone_box_sales)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  0.3132900002296261
We can't reject the null hypothesis


# Comparison of PSV and XONE

#Null Hypothesis

## The mean sales of PSV and XONE are the same


alpha = 0.05

results = st.ttest_ind(psv_box_sales, xone_box_sales)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  0.2565719484054285
We can't reject the null hypothesis


# Filtering data frame for most relevant platforms and key years 2013-2016
df3 = df[df['platform'].isin(most_relevant_platforms_list) & df['year_of_release'].isin(key_years)]
df3.head()


# Filter by EU region sales
df_eu = df4[['platform', 'eu_sales']]


# Convert EU data to list
df_eu_sales = df_eu['eu_sales'].to_list()


# Filter by JP region sales
df_jp = df4[['platform', 'jp_sales']]


# Convert JP data to list
df_jp_sales = df_jp['jp_sales'].to_list()


# Filter for NA region sales
df_na = df4[['platform', 'na_sales']]


# Convert NA sales to list
df_na_sales = df_na['na_sales'].to_list()


# Filter for Other region sales
df_other = df4[['platform', 'other_sales']]


# Convert other region sales to list
df_other_sales = df_other['other_sales'].to_list()


x_data = ['Europe', 'Japan', 'North America', 'Other']

y_data = [df_eu_sales , df_jp_sales, df_na_sales, df_other_sales]

colors = ['rgba(44, 160, 101, 0.5)',
          'rgba(255, 65, 54, 0.5)', 'rgba(207, 114, 255, 0.5)', 'rgba(127, 96, 0, 0.5)']

fig = go.Figure()

for xd, yd, cls in zip(x_data, y_data, colors):
        fig.add_trace(go.Box(
            y=yd,
            name=xd,
            jitter=0.5,
            whiskerwidth=0.2,
            fillcolor=cls,
            marker_size=2,
            line_width=1)
        )

fig.update_layout(
    title='Regional Sales from 2013 to 2016',
    yaxis=dict(
        autorange=True,
        showgrid=True,
        zeroline=True,
        dtick=10,
        gridcolor='rgb(255, 255, 255)',
        gridwidth=1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(
        l=40,
        r=30,
        b=80,
        t=100,
    ),
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
    showlegend=False
)

fig.show()


# Comparison of EU and JP

#Null Hypothesis

## The mean sales of EU and JP are the same


alpha = 0.05

results = st.ttest_ind(df_eu_sales, df_jp_sales)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  0.12795336461497347
We can't reject the null hypothesis


# Comparison of EU and NA

#Null Hypothesis

## The mean sales of EU and NA are the same


alpha = 0.05

results = st.ttest_ind(df_eu_sales, df_na_sales)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  0.9139153517721482
We can't reject the null hypothesis


# Comparison of EU and Other

#Null Hypothesis

## The mean sales of EU and Other are the same


alpha = 0.05

results = st.ttest_ind(df_eu_sales, df_other_sales)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  0.05156498857359916
We can't reject the null hypothesis


# Comparison of JP and NA

#Null Hypothesis

## The mean sales of JP and NA are the same


alpha = 0.05

results = st.ttest_ind(df_jp_sales, df_na_sales)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  0.08131759488099968
We can't reject the null hypothesis


# Comparison of JP and NA

#Null Hypothesis

## The mean sales of JP and NA are the same


alpha = 0.05

results = st.ttest_ind(df_jp_sales, df_na_sales)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  0.08131759488099968
We can't reject the null hypothesis


# Comparison of NA and Other

#Null Hypothesis

## The mean sales of NA and Other are the same


alpha = 0.05

results = st.ttest_ind(df_na_sales, df_other_sales)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  0.014543290639873504
We reject the null hypothesis, the means are different


# Creating a filter
ps4_list = ['PS4']


# filtering data frame for PS4 
df_ps4 = df3[df3['platform'].isin(ps4_list)]


# Sorting the resulting data set by scores and total sales
ps4_score_sales = df_ps4[['user_score', 'critic_score','average_score', 'total_sales']].sort_values(by='total_sales', ascending=False)


# correlation between scores and total sales
ps4_score_sales.corr()


# sales based on review scores
px.scatter(ps4_score_sales, x=['user_score', 'critic_score', 'average_score'], 
y='total_sales', trendline='ols', trendline_color_override='black',
title='PS4 Total Sales Based on Review Scores').show()


# Filter by 2013 to 2016
df5 = df[df['year_of_release'].isin(key_years)]


# group the platforms by sales
df6 = df5.groupby('platform')['eu_sales', 'jp_sales', 'na_sales', 'other_sales', 'average_sales', 'total_sales'].sum()

C:\Users\XIX\AppData\Local\Temp\ipykernel_23028\3857988385.py:2: FutureWarning:

Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.


# Sort platforms by highest total sales
df6.sort_values(by='total_sales',ascending=False).head(10)


# Top games dataset
top_five_games


# Top 20 Games by total sales from 2013-2016
df5.pivot_table(index='name', values=['total_sales', 'average_sales'], 
aggfunc='sum').reset_index().sort_values(by='total_sales', ascending=False).head(20)


# Pivot table of games and their platform, aggregated by total sales
df7 = df5.pivot_table(index=['name', 'platform'], values='total_sales', aggfunc='sum').reset_index()


# Creating filters
cross_games = ('grand theft auto v', 'call of duty: ghosts', 'minecraft', 'fifa 15')
cross_platforms = ('PS4', 'XONE', 'WIIU', 'PC', '3DS')


# Applying filter to our data set
df_cross = df7[df7['name'].isin(cross_games) & df7['platform'].isin(cross_platforms)]


# Seeing which platforms have the most games in our selection
df7[df7['platform'].isin(cross_platforms)].sort_values(by=['name', 'platform']).value_counts(subset='platform')

platform
PS4     392
3DS     303
XONE    247
PC      189
WIIU    115
dtype: int64


# Making a COD data set
cod = df_cross[df_cross['name']=='call of duty: ghosts']


#making a GTA data set
gta = df_cross[df_cross['name']=='grand theft auto v']


px.bar(cod, x='platform', y='total_sales', color='platform', title='COD: Ghost Platform Sales').show()


# GTA V sales
px.bar(gta, x='platform', y='total_sales', color='platform', title='GTA V Platform Sales').show()


# Sorting data by genre
genres = df5.groupby('genre')['total_sales'].sum().sort_values(ascending=False).reset_index()


# sales by genre
px.bar(genres, x='genre', y='total_sales', color='genre', title='Total Sales by Genre 2013-216').show()


# creating a rating pivot table 
df_rating = df5.pivot_table(index=['name', 'platform', 'rating'], values='total_sales', aggfunc='sum').reset_index()


# Indexing for the total sales based on rating
df_rating_sales = df_rating.groupby('rating')['total_sales'].sum().sort_values(ascending=False).reset_index()


# game rating sales
px.bar(df_rating_sales, x='rating', y='total_sales', color='rating', title='Game Rating Sales 2013-2016').show()


# Filtering dataframe for key years 2013 to 2016
df8 = df[df['year_of_release'].isin(key_years)]


# grouping result by platform, then looking at sales data
df9 = df8.groupby('platform')[['total_sales','eu_sales', 'jp_sales', 'na_sales', 'other_sales']].sum().reset_index()


# platfrom global market share
px.pie(df9, values='total_sales', names='platform', title='Global Market Share by Platform').show()


# European top platforms
df_top_eu = df9.sort_values(by='eu_sales', ascending=False).head()
df_top_eu = df_top_eu[['platform','eu_sales']].reset_index()


# Cum sum of sales in europe
df_top_eu_sum =df_top_eu.cumsum()
df_top_eu_sum = df_top_eu_sum[4:]
df_top_eu_sum = df_top_eu_sum['eu_sales'].to_list()


# multiply market share by 100
df_top_eu['market_share']= (df_top_eu['eu_sales'] / df_top_eu_sum) *100


# european market share
px.pie(df_top_eu, values='market_share', names='platform', title='European Region Platform Market Shares').show()


# top japanese platforms
df_top_jp = df9.sort_values(by='jp_sales', ascending=False).head()
df_top_jp = df_top_jp[['platform','jp_sales']].reset_index()


# Cum sum of sales in japan
df_top_jp_sum =df_top_jp.cumsum()
df_top_jp_sum = df_top_jp_sum[4:]
df_top_jp_sum = df_top_jp_sum['jp_sales'].to_list()


# market share percentage
df_top_jp['market_share']= (df_top_jp['jp_sales'] / df_top_jp_sum) *100


# Japan platform market share
px.pie(df_top_jp, values='market_share', names='platform', title='Japanese Region Platform Market Shares').show()


# Top north american platforms
df_top_na = df9.sort_values(by='na_sales', ascending=False).head()
df_top_na = df_top_na[['platform','na_sales']].reset_index()


# Cum sum of sales in north america
df_top_na_sum =df_top_na.cumsum()
df_top_na_sum = df_top_na_sum[4:]
df_top_na_sum = df_top_na_sum['na_sales'].to_list()


# market share percentage
df_top_na['market_share']= (df_top_na['na_sales'] / df_top_na_sum) *100


# north america platform market share
px.pie(df_top_na, values='market_share', names='platform', title='North American Region Platform Market Shares').show()


# other region top platform sales
df_top_other = df9.sort_values(by='other_sales', ascending=False).head()
df_top_other = df_top_other[['platform','other_sales']].reset_index()


# Cum sum of sales in north america
df_top_other_sum =df_top_other.cumsum()
df_top_other_sum = df_top_other_sum[4:]
df_top_other_sum = df_top_other_sum['other_sales'].to_list()


# market share percentage
df_top_other['market_share']= (df_top_other['other_sales'] / df_top_other_sum) *100


# other region market share
px.pie(df_top_other, values='market_share', names='platform', title='Other Region Platform Market Shares').show()


# Categorizing datafarame by genre and sum of sales
df10 = df8.groupby('genre')[['total_sales','eu_sales', 'jp_sales', 'na_sales', 'other_sales']].sum().reset_index()


# market share, genre
px.pie(df10, values='total_sales', names='genre', title='Global Market Share by Genre').show()


# Top 5 european region genres
df_genre_eu = df10.sort_values(by='eu_sales', ascending=False).head()
df_genre_eu = df_genre_eu[['genre','eu_sales']].reset_index()


# Cum sum of sales in europe
df_genre_eu_sum =df_genre_eu.cumsum()
df_genre_eu_sum = df_genre_eu_sum[4:]
df_genre_eu_sum = df_genre_eu_sum['eu_sales'].to_list()


# Creating market share column
df_genre_eu['market_share']= (df_genre_eu['eu_sales'] / df_genre_eu_sum) *100


# European genre market share
px.pie(df_genre_eu, values='market_share', names='genre', title='European Region Genre Market Shares').show()


# Top 5 japanese region genres
df_genre_jp = df10.sort_values(by='jp_sales', ascending=False).head()
df_genre_jp = df_genre_jp[['genre','jp_sales']].reset_index()


# Cum sum of sales in japan
df_genre_jp_sum =df_genre_jp.cumsum()
df_genre_jp_sum = df_genre_jp_sum[4:]
df_genre_jp_sum = df_genre_jp_sum['jp_sales'].to_list()


# Creating market share column
df_genre_jp['market_share']= (df_genre_jp['jp_sales'] / df_genre_jp_sum) *100


# Japan genre market share
px.pie(df_genre_jp, values='market_share', names='genre', title='Japanese Region Genre Market Shares')


# Top 5 north american region genres
df_genre_na = df10.sort_values(by='na_sales', ascending=False).head()
df_genre_na = df_genre_na[['genre','na_sales']].reset_index()


# Cum sum of sales in japan
df_genre_na_sum =df_genre_na.cumsum()
df_genre_na_sum = df_genre_na_sum[4:]
df_genre_na_sum = df_genre_na_sum['na_sales'].to_list()


# Creating market share column
df_genre_na['market_share']= (df_genre_na['na_sales'] / df_genre_na_sum) *100


# North America genre market share
px.pie(df_genre_na, values='market_share', names='genre', title='North American Region Genre Market Shares').show()


# Top 5 other region genres
df_genre_other = df10.sort_values(by='other_sales', ascending=False).head()
df_genre_other = df_genre_other[['genre','other_sales']].reset_index()


# Cum sum of sales in japan
df_genre_other_sum =df_genre_other.cumsum()
df_genre_other_sum = df_genre_other_sum[4:]
df_genre_other_sum = df_genre_other_sum['other_sales'].to_list()


# Creating market share column
df_genre_other['market_share']= (df_genre_other['other_sales'] / df_genre_other_sum) *100


# Other genre market share
px.pie(df_genre_other, values='market_share', names='genre', title='Other Region Genre Market Shares').show()


# creating pivot based on ratings, then collect sales
df_esrb = df5.pivot_table(index=['name', 'rating'], 
values=['eu_sales','jp_sales', 'na_sales', 'other_sales','total_sales'], 
aggfunc='sum').reset_index()


# grouping by rating, taking sum of sales data
df_esrb_sales = df_esrb.groupby('rating')[['eu_sales','jp_sales', 'na_sales', 'other_sales']].sum().reset_index()


# sales based on rating
px.bar(df_esrb_sales, x='rating', y=['eu_sales','jp_sales', 'na_sales', 'other_sales'], 
    barmode='group', title='Regional Sales Based on ESRB Rating').show()


# list of platfroms to filter 
xone_list = ['XONE']
pc_list = ['PC']


# Filtered dataframe
df_xone_pc = df[df['platform'].isin(xone_list) | df['platform'].isin(pc_list)].dropna()


# Filter again by XONE
xone_user_list = df_xone_pc[df_xone_pc['platform'].isin(xone_list)]


# List of user scores
xone_user_list = xone_user_list['user_score'].to_list()


# Filter again by PC
pc_user_list = df_xone_pc[df_xone_pc['platform'].isin(pc_list)]


# Convert values to list
pc_user_list = pc_user_list['user_score'].to_list()


# Comparison of XONE and PC user ratings

#Null Hypothesis

## The mean user ratings of XONE and PC are the same


alpha = 0.05

results = st.ttest_ind(xone_user_list, pc_user_list)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  1.3810936500327673e-05
We reject the null hypothesis, the means are different


# Creating action and sports filter
action = ['Action']
sports = ['Sports']


# Filtered dataframe
df_action_sports = df[df['genre'].isin(action) | df['genre'].isin(sports)].dropna()


# Filter again by action
action_user_list = df_action_sports[df_action_sports['genre'].isin(action)]


# List of user scores
action_user_list = action_user_list['user_score'].to_list()


# Filter again by action
sports_user_list = df_action_sports[df_action_sports['genre'].isin(sports)]


# List of user scores
sports_user_list = sports_user_list['user_score'].to_list()


# Comparison of Action and Sports user ratings

#Null Hypothesis

## The mean user ratings of Action and Sports are the same


alpha = 0.05

results = st.ttest_ind(action_user_list, sports_user_list)

print('p-value: ', results.pvalue)

if results.pvalue < alpha:
    print("We reject the null hypothesis, the means are different")
else:
    print("We can't reject the null hypothesis")

p-value:  5.896027231289071e-06
We reject the null hypothesis, the means are different

	year_of_release	na_sales	eu_sales	jp_sales	other_sales	critic_score
count	16446.000000	16715.000000	16715.000000	16715.000000	16715.000000	8137.000000
mean	2006.484616	0.263377	0.145060	0.077617	0.047342	68.967679
std	5.877050	0.813604	0.503339	0.308853	0.186731	13.938165
min	1980.000000	0.000000	0.000000	0.000000	0.000000	13.000000
25%	2003.000000	0.000000	0.000000	0.000000	0.000000	60.000000
50%	2007.000000	0.080000	0.020000	0.000000	0.010000	71.000000
75%	2010.000000	0.240000	0.110000	0.040000	0.030000	79.000000
max	2016.000000	41.360000	28.960000	10.220000	10.570000	98.000000

	critic_score	eu_sales	jp_sales	na_sales	total_sales
critic_score	1.000000	0.221330	0.153510	0.240150	0.245414
eu_sales	0.221330	1.000000	0.435892	0.766545	0.901673
jp_sales	0.153510	0.435892	1.000000	0.451159	0.613303
na_sales	0.240150	0.766545	0.451159	1.000000	0.941241
total_sales	0.245414	0.901673	0.613303	0.941241	1.000000

	user_score	eu_sales	jp_sales	na_sales	total_sales
user_score	1.000000	0.144300	0.142276	0.153533	0.165091
eu_sales	0.144300	1.000000	0.435892	0.766545	0.901673
jp_sales	0.142276	0.435892	1.000000	0.451159	0.613303
na_sales	0.153533	0.766545	0.451159	1.000000	0.941241
total_sales	0.165091	0.901673	0.613303	0.941241	1.000000

year_of_release	2013	2014	2015	2016
platform
3DS	91.0	80.0	86.0	46.0
PC	38.0	47.0	50.0	54.0
PS4	16.0	75.0	137.0	164.0
PSV	63.0	100.0	110.0	85.0
XONE	19.0	61.0	80.0	87.0

	platform	average_sales	eu_sales	jp_sales	na_sales	other_sales	total_sales
20	PSV	13.4525	13.07	21.84	12.47	6.43	53.81
30	XONE	39.8300	51.59	0.34	93.12	14.27	159.32
2	3DS	64.4525	61.27	100.62	82.65	13.27	257.81
13	PC	63.9400	140.37	0.17	93.34	21.88	255.76
18	PS4	78.5350	141.09	15.96	108.74	48.35	314.14

	Name	Platform	Year_of_Release	Genre	NA_sales	EU_sales	JP_sales	Other_sales	Critic_Score	User_Score	Rating
0	Wii Sports	Wii	2006.0	Sports	41.36	28.96	3.77	8.45	76.0	8	E
1	Super Mario Bros.	NES	1985.0	Platform	29.08	3.58	6.81	0.77	NaN	NaN	NaN
2	Mario Kart Wii	Wii	2008.0	Racing	15.68	12.76	3.79	3.29	82.0	8.3	E
3	Wii Sports Resort	Wii	2009.0	Sports	15.61	10.93	3.28	2.95	80.0	8	E
4	Pokemon Red/Pokemon Blue	GB	1996.0	Role-Playing	11.27	8.89	10.22	1.00	NaN	NaN	NaN

	name	platform	year_of_release	genre	na_sales	eu_sales	jp_sales	other_sales	critic_score	user_score	rating
1591	Need for Speed: Most Wanted	X360	2005.0	Racing	1.0	0.13	0.02	0.10	83.0	8.5	T
4127	Sonic the Hedgehog	PS3	NaN	Platform	0.0	0.48	0.00	0.00	43.0	4.1	E10+
11715	Need for Speed: Most Wanted	PC	2012.0	Racing	0.0	0.06	0.00	0.02	82.0	8.5	T
14244	NaN	GEN	1993.0	NaN	0.0	0.00	0.03	0.00	NaN	NaN	NaN
16230	Madden NFL 13	PS3	2012.0	Sports	0.0	0.01	0.00	0.00	83.0	5.5	E

	name	platform	year_of_release	genre	na_sales	eu_sales	jp_sales	other_sales	critic_score	user_score	rating
257	Sonic the Hedgehog	GEN	1991.0	Platform	3.03	0.91	0.26	0.13	NaN	NaN	NaN
1745	Sonic the Hedgehog	PS3	2006.0	Platform	0.41	0.06	0.04	0.66	43.0	4.1	E10+
1996	Sonic the Hedgehog	X360	2006.0	Platform	0.44	0.48	0.00	0.11	46.0	4.4	E10+
4127	Sonic the Hedgehog	PS3	NaN	Platform	0.00	0.48	0.00	0.00	43.0	4.1	E10+

	name	platform	year_of_release	genre	na_sales	eu_sales	jp_sales	other_sales	critic_score	user_score	rating
183	Madden NFL 2004	PS2	NaN	Sports	4.26	0.26	0.01	0.71	94.0	8.5	E
377	FIFA Soccer 2004	PS2	NaN	Sports	0.59	2.36	0.04	0.51	84.0	6.4	E
456	LEGO Batman: The Videogame	Wii	NaN	Action	1.80	0.97	0.00	0.29	74.0	7.9	E10+
475	wwe Smackdown vs. Raw 2006	PS2	NaN	Fighting	1.57	1.02	0.00	0.41	NaN	NaN	NaN
609	Space Invaders	2600	NaN	Shooter	2.36	0.14	0.00	0.03	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...
16373	PDC World Championship Darts 2008	PSP	NaN	Sports	0.01	0.00	0.00	0.00	43.0	0.0	E10+
16405	Freaky Flyers	GC	NaN	Racing	0.01	0.00	0.00	0.00	69.0	6.5	T
16448	Inversion	PC	NaN	Shooter	0.01	0.00	0.00	0.00	59.0	6.7	M
16458	Hakuouki: Shinsengumi Kitan	PS3	NaN	Adventure	0.01	0.00	0.00	0.00	NaN	NaN	NaN
16522	Virtua Quest	GC	NaN	Role-Playing	0.01	0.00	0.00	0.00	55.0	5.5	T

	platform	rating	0
0	DS	E	865
1	PS2	T	567
2	PS2	E	541
3	Wii	E	493
4	GBA	E	419
...	...	...	...
70	GC	EC	1
71	PC	RP	1
72	PS	K-A	1
73	PS2	EC	1
74	DC	M	1

year_of_release	1985	2000	2001	2002	2003	2004	2005	2006	2007	2008	2009	2010	2011	2012	2013	2014	2015	2016
platform
PS2	NaN	82.0	185.0	280.0	256.0	259.0	260.0	259.0	214.0	191.0	96.0	38.0	7.0	NaN	NaN	NaN	NaN	NaN
PS3	NaN	NaN	NaN	NaN	NaN	NaN	NaN	27.0	90.0	138.0	162.0	181.0	215.0	147.0	126.0	108.0	73.0	38.0
WII	NaN	NaN	NaN	NaN	NaN	NaN	NaN	44.0	185.0	282.0	325.0	253.0	143.0	31.0	12.0	6.0	4.0	1.0
X360	NaN	NaN	NaN	NaN	NaN	NaN	18.0	93.0	123.0	146.0	172.0	182.0	206.0	106.0	75.0	63.0	35.0	13.0
DS	1.0	NaN	NaN	NaN	NaN	23.0	118.0	201.0	376.0	492.0	403.0	323.0	153.0	23.0	8.0	NaN	NaN	NaN

year_of_release	2000	2001	2002	2003	2004	2005	2006	2007	2008	2009	2010	2011	2012	2013	2014	2015	2016
platform
3DS	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	116.0	93.0	91.0	80.0	86.0	46.0
PC	7.0	15.0	19.0	33.0	30.0	37.0	52.0	62.0	76.0	107.0	90.0	139.0	61.0	38.0	47.0	50.0	54.0
PS4	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	16.0	75.0	137.0	164.0
PSV	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	18.0	53.0	63.0	100.0	110.0	85.0
XONE	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	19.0	61.0	80.0	87.0

	index	DS
0	average_sales	24.769417
1	eu_sales	15.039643
2	jp_sales	72.424962
3	na_sales	21.174933
4	other_sales	12.735125
5	total_sales	24.769417

	index	PC
0	average_sales	24.572461
1	eu_sales	34.455927
2	jp_sales	0.122364
3	na_sales	23.913712
4	other_sales	20.998081
5	total_sales	24.572461

	index	PC
0	average_sales	63.94
1	eu_sales	140.37
2	jp_sales	0.17
3	na_sales	93.34
4	other_sales	21.88
5	total_sales	255.76

	user_score	critic_score	average_score	total_sales
user_score	1.000000	0.520752	0.922750	0.023279
critic_score	0.520752	1.000000	0.839231	0.406568
average_score	0.922750	0.839231	1.000000	0.201518
total_sales	0.023279	0.406568	0.201518	1.000000

	eu_sales	jp_sales	na_sales	other_sales	average_sales	total_sales
platform
PS4	141.09	15.96	108.74	48.35	78.5350	314.14
PS3	67.81	23.35	63.50	26.77	45.3575	181.43
XONE	51.59	0.34	93.12	14.27	39.8300	159.32
3DS	30.96	67.81	38.20	6.28	35.8125	143.25
X360	42.52	0.51	81.66	12.11	34.2000	136.80
WIIU	19.85	10.88	29.21	4.69	16.1575	64.63
PC	25.36	0.00	11.11	2.96	9.8575	39.43
PSV	6.10	18.59	5.04	3.26	8.2475	32.99
WII	5.93	0.05	6.56	1.12	3.4150	13.66
PSP	0.17	3.29	0.00	0.04	0.8750	3.50

	name	average_sales	total_sales
10927	wii sports	20.6350	82.54
3691	grand theft auto v	14.1450	56.58
9300	super mario bros.	11.3275	45.31
9660	tetris	8.9600	35.84
5501	mario kart wii	8.8800	35.52

	platform	rating	0
0	DS	E	865
1	PS2	T	567
2	PS2	E	541
3	Wii	E	493
4	GBA	E	419
...	...	...	...
70	GC	EC	1
71	PC	RP	1
72	PS	K-A	1
73	PS2	EC	1
74	DC	M	1

	index	PS4
0	average_sales	30.181392
1	eu_sales	34.632662
2	jp_sales	11.4878
3	na_sales	27.859192
4	other_sales	46.401152
5	total_sales	30.181392

	index	PS4
0	average_sales	78.535
1	eu_sales	141.09
2	jp_sales	15.96
3	na_sales	108.74
4	other_sales	48.35
5	total_sales	314.14

	index	PSV
0	average_sales	5.169863
1	eu_sales	3.208228
2	jp_sales	15.720147
3	na_sales	3.194815
4	other_sales	6.170825
5	total_sales	5.169863

	index	XONE
0	average_sales	15.306868
1	eu_sales	12.663541
2	jp_sales	0.244728
3	na_sales	23.857348
4	other_sales	13.694818
5	total_sales	15.306868

	index	XONE
0	average_sales	39.83
1	eu_sales	51.59
2	jp_sales	0.34
3	na_sales	93.12
4	other_sales	14.27
5	total_sales	159.32

	name	platform	year_of_release	genre	na_sales	eu_sales	jp_sales	other_sales	critic_score	user_score	rating	average_score	total_sales	average_sales
31	call of duty: black ops 3	PS4	2015	Shooter	6.03	5.86	0.36	2.38	NaN	NaN	NaN	NaN	14.63	3.6575
33	pokemon x/pokemon y	3DS	2013	Role-Playing	5.28	4.19	4.35	0.78	NaN	NaN	NaN	NaN	14.60	3.6500
42	grand theft auto v	PS4	2014	Action	3.96	6.31	0.38	1.97	9.7	8.3	M	9.00	12.62	3.1550
47	pokemon omega ruby/pokemon alpha sapphire	3DS	2014	Role-Playing	4.35	3.49	3.10	0.74	NaN	NaN	NaN	NaN	11.68	2.9200
77	fifa 16	PS4	2015	Sports	1.12	6.12	0.06	1.28	8.2	4.3	E	6.25	8.58	2.1450

	platform	rating	0
0	DS	E	865
1	PS2	T	567
2	PS2	E	541
3	Wii	E	493
4	GBA	E	419
...	...	...	...
70	GC	EC	1
71	PC	RP	1
72	PS	K-A	1
73	PS2	EC	1
74	DC	M	1