The purpose of this project is to compare music preferences in the cities of Springfield and Shelbyville. We will use data provided by Yandex.Music to test hypotheses about user behavior.
Open the data on Yandex.Music and explore it.
# importing pandas
import pandas as pd
# read dataset and store to df
df = pd.read_csv('datasets/music_project_en.csv')
# ensure proper reading of file
df.head()
| | userID | Track | artist | genre | City | time | Day |
---|---|---|---|---|---|---|---|
0 | FFB692EC | Kamigata To Boots | The Mass Missile | rock | Shelbyville | 20:28:33 | Wednesday |
1 | 55204538 | Delayed Because of Accident | Andreas Rönnberg | rock | Springfield | 14:07:09 | Friday |
2 | 20EC38 | Funiculì funiculà | Mario Lanza | pop | Shelbyville | 20:58:07 | Wednesday |
3 | A3DD03C9 | Dragons in the Sunset | Fire + Ice | folk | Shelbyville | 08:37:09 | Monday |
4 | E2DC1FAE | Soul People | Space Echo | dance | Springfield | 08:34:34 | Monday |
# description of the data
df.describe()
| | userID | Track | artist | genre | City | time | Day |
---|---|---|---|---|---|---|---|
count | 65079 | 63736 | 57512 | 63881 | 65079 | 65079 | 65079 |
unique | 41748 | 39666 | 37806 | 268 | 2 | 20392 | 3 |
top | A8AE9169 | Brand | Kartvelli | pop | Springfield | 08:14:07 | Friday |
freq | 76 | 136 | 136 | 8850 | 45360 | 14 | 23149 |
# obtaining the first 10 rows from the df table
df.head(10)
| | userID | Track | artist | genre | City | time | Day |
---|---|---|---|---|---|---|---|
0 | FFB692EC | Kamigata To Boots | The Mass Missile | rock | Shelbyville | 20:28:33 | Wednesday |
1 | 55204538 | Delayed Because of Accident | Andreas Rönnberg | rock | Springfield | 14:07:09 | Friday |
2 | 20EC38 | Funiculì funiculà | Mario Lanza | pop | Shelbyville | 20:58:07 | Wednesday |
3 | A3DD03C9 | Dragons in the Sunset | Fire + Ice | folk | Shelbyville | 08:37:09 | Monday |
4 | E2DC1FAE | Soul People | Space Echo | dance | Springfield | 08:34:34 | Monday |
5 | 842029A1 | Chains | Obladaet | rusrap | Shelbyville | 13:09:41 | Friday |
6 | 4CB90AA5 | True | Roman Messer | dance | Springfield | 13:00:07 | Wednesday |
7 | F03E1C1F | Feeling This Way | Polina Griffith | dance | Springfield | 20:47:49 | Wednesday |
8 | 8FA1D3BE | L’estate | Julia Dalia | ruspop | Springfield | 09:17:40 | Friday |
9 | E772D5C0 | Pessimist | NaN | dance | Shelbyville | 21:20:49 | Wednesday |
# obtaining general information about the data in df
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65079 entries, 0 to 65078
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   userID  65079 non-null  object
 1   Track   63736 non-null  object
 2   artist  57512 non-null  object
 3   genre   63881 non-null  object
 4   City    65079 non-null  object
 5   time    65079 non-null  object
 6   Day     65079 non-null  object
dtypes: object(7)
memory usage: 3.5+ MB
Correct the formatting in the column headers and deal with the missing values. Then, check whether there are duplicates in the data.
# the list of column names in the df table
df.columns
Index([' userID', 'Track', 'artist', 'genre', ' City ', 'time', 'Day'], dtype='object')
# renaming columns
df = df.rename(columns={' userID' : 'user_id', 'Track' : 'track', ' City ' : 'city', 'Day' : 'day'})
# checking result: the list of column names
df.columns
Index(['user_id', 'track', 'artist', 'genre', 'city', 'time', 'day'], dtype='object')
# calculating missing values
df.isna().sum()
user_id       0
track      1343
artist     7567
genre      1198
city          0
time          0
day           0
dtype: int64
# looping over column names and replacing missing values with 'unknown'
columns_to_replace = df.columns
for column in columns_to_replace:
    df[column] = df[column].fillna('unknown')
# checking whether NaN still exists in the data set
df.head(16)
| | user_id | track | artist | genre | city | time | day |
---|---|---|---|---|---|---|---|
0 | FFB692EC | Kamigata To Boots | The Mass Missile | rock | Shelbyville | 20:28:33 | Wednesday |
1 | 55204538 | Delayed Because of Accident | Andreas Rönnberg | rock | Springfield | 14:07:09 | Friday |
2 | 20EC38 | Funiculì funiculà | Mario Lanza | pop | Shelbyville | 20:58:07 | Wednesday |
3 | A3DD03C9 | Dragons in the Sunset | Fire + Ice | folk | Shelbyville | 08:37:09 | Monday |
4 | E2DC1FAE | Soul People | Space Echo | dance | Springfield | 08:34:34 | Monday |
5 | 842029A1 | Chains | Obladaet | rusrap | Shelbyville | 13:09:41 | Friday |
6 | 4CB90AA5 | True | Roman Messer | dance | Springfield | 13:00:07 | Wednesday |
7 | F03E1C1F | Feeling This Way | Polina Griffith | dance | Springfield | 20:47:49 | Wednesday |
8 | 8FA1D3BE | L’estate | Julia Dalia | ruspop | Springfield | 09:17:40 | Friday |
9 | E772D5C0 | Pessimist | unknown | dance | Shelbyville | 21:20:49 | Wednesday |
10 | BC5A3A29 | Gool la Mita | Shireen Abdul Wahab | world | Springfield | 14:08:42 | Monday |
11 | 8B5192C0 | Is There Anybody Out There? (Panoramic Paralys... | Pink Floyd Floydhead | electronic | Springfield | 13:47:49 | Monday |
12 | FF3FD2BD | Truth | Bamboo | pop | Springfield | 09:19:49 | Monday |
13 | CC782B0F | After School Special | Detroit Grand Pubahs | dance | Springfield | 20:04:12 | Friday |
14 | 94EB25C2 | Make Love Whenever You Can | Arabesque | pop | Shelbyville | 13:22:08 | Wednesday |
15 | E3C5756F | unknown | unknown | unknown | Springfield | 09:24:51 | Monday |
# counting missing values
df.isna().sum()
user_id    0
track      0
artist     0
genre      0
city       0
time       0
day        0
dtype: int64
Find the number of obvious duplicates in the table.
# counting clear duplicates
print(df.duplicated().sum())
3826
# removing obvious duplicates
df = df.drop_duplicates()
# checking for duplicates
df.duplicated().sum()
0
# looking at tail of data
df.tail()
| | user_id | track | artist | genre | city | time | day |
---|---|---|---|---|---|---|---|
65074 | 729CBB09 | My Name | McLean | rnb | Springfield | 13:32:28 | Wednesday |
65075 | D08D4A55 | Maybe One Day (feat. Black Spade) | Blu & Exile | hip | Shelbyville | 10:00:00 | Monday |
65076 | C5E3A0D5 | Jalopiina | unknown | industrial | Springfield | 20:09:26 | Friday |
65077 | 321D0506 | Freight Train | Chas McDevitt | rock | Springfield | 21:43:59 | Friday |
65078 | 3A64EF84 | Tell Me Sweet Little Lies | Monica Lopez | country | Springfield | 21:59:46 | Friday |
# viewing the tail with the index reset after dropping rows (the result is not saved back to df)
df.reset_index(drop=True).tail()
| | user_id | track | artist | genre | city | time | day |
---|---|---|---|---|---|---|---|
61248 | 729CBB09 | My Name | McLean | rnb | Springfield | 13:32:28 | Wednesday |
61249 | D08D4A55 | Maybe One Day (feat. Black Spade) | Blu & Exile | hip | Shelbyville | 10:00:00 | Monday |
61250 | C5E3A0D5 | Jalopiina | unknown | industrial | Springfield | 20:09:26 | Friday |
61251 | 321D0506 | Freight Train | Chas McDevitt | rock | Springfield | 21:43:59 | Friday |
61252 | 3A64EF84 | Tell Me Sweet Little Lies | Monica Lopez | country | Springfield | 21:59:46 | Friday |
# viewing unique genre names
sorted_df = df.sort_values(by='genre', ascending=True)
print(sorted_df['genre'].unique())
['acid' 'acoustic' 'action' 'adult' 'africa' 'afrikaans' 'alternative' 'ambient' 'americana' 'animated' 'anime' 'arabesk' 'arabic' 'arena' 'argentinetango' 'art' 'audiobook' 'avantgarde' 'axé' 'baile' 'balkan' 'beats' 'bigroom' 'black' 'bluegrass' 'blues' 'bollywood' 'bossa' 'brazilian' 'breakbeat' 'breaks' 'broadway' 'cantautori' 'cantopop' 'canzone' 'caribbean' 'caucasian' 'celtic' 'chamber' 'children' 'chill' 'chinese' 'choral' 'christian' 'christmas' 'classical' 'classicmetal' 'club' 'colombian' 'comedy' 'conjazz' 'contemporary' 'country' 'cuban' 'dance' 'dancehall' 'dancepop' 'dark' 'death' 'deep' 'deutschrock' 'deutschspr' 'dirty' 'disco' 'dnb' 'documentary' 'downbeat' 'downtempo' 'drum' 'dub' 'dubstep' 'eastern' 'easy' 'electronic' 'electropop' 'emo' 'entehno' 'epicmetal' 'estrada' 'ethnic' 'eurofolk' 'european' 'experimental' 'extrememetal' 'fado' 'film' 'fitness' 'flamenco' 'folk' 'folklore' 'folkmetal' 'folkrock' 'folktronica' 'forró' 'frankreich' 'französisch' 'french' 'funk' 'future' 'gangsta' 'garage' 'german' 'ghazal' 'gitarre' 'glitch' 'gospel' 'gothic' 'grime' 'grunge' 'gypsy' 'handsup' "hard'n'heavy" 'hardcore' 'hardstyle' 'hardtechno' 'hip' 'hip-hop' 'hiphop' 'historisch' 'holiday' 'hop' 'horror' 'house' 'idm' 'independent' 'indian' 'indie' 'indipop' 'industrial' 'inspirational' 'instrumental' 'international' 'irish' 'jam' 'japanese' 'jazz' 'jewish' 'jpop' 'jungle' 'k-pop' 'karadeniz' 'karaoke' 'kayokyoku' 'korean' 'laiko' 'latin' 'latino' 'leftfield' 'local' 'lounge' 'loungeelectronic' 'lovers' 'malaysian' 'mandopop' 'marschmusik' 'meditative' 'mediterranean' 'melodic' 'metal' 'metalcore' 'mexican' 'middle' 'minimal' 'miscellaneous' 'modern' 'mood' 'mpb' 'muslim' 'native' 'neoklassik' 'neue' 'new' 'newage' 'newwave' 'nu' 'nujazz' 'numetal' 'oceania' 'old' 'opera' 'orchestral' 'other' 'piano' 'pop' 'popelectronic' 'popeurodance' 'post' 'posthardcore' 'postrock' 'power' 'progmetal' 'progressive' 'psychedelic' 'punjabi' 'punk' 'quebecois' 'ragga' 'ram' 'rancheras' 'rap' 'rave' 'reggae' 'reggaeton' 'regional' 'relax' 'religious' 'retro' 'rhythm' 'rnb' 'rnr' 'rock' 'rockabilly' 'romance' 'roots' 'ruspop' 'rusrap' 'rusrock' 'salsa' 'samba' 'schlager' 'self' 'sertanejo' 'shoegazing' 'showtunes' 'singer' 'ska' 'slow' 'smooth' 'soul' 'soulful' 'sound' 'soundtrack' 'southern' 'specialty' 'speech' 'spiritual' 'sport' 'stonerrock' 'surf' 'swing' 'synthpop' 'sängerportrait' 'tango' 'tanzorchester' 'taraftar' 'tech' 'techno' 'thrash' 'top' 'traditional' 'tradjazz' 'trance' 'tribal' 'trip' 'triphop' 'tropical' 'türk' 'türkçe' 'unknown' 'urban' 'uzbek' 'variété' 'vi' 'videogame' 'vocal' 'western' 'world' 'worldbeat' 'ïîï']
# function for replacing implicit duplicates
def replace_wrong_genres(wrong_genres, correct_genre):
    # replace each wrong name in the 'genre' column with the single correct name
    for wrong_genre in wrong_genres:
        df['genre'] = df['genre'].replace(wrong_genre, correct_genre)

# removing implicit duplicates
duplicates = ['hip', 'hop', 'hip-hop']
correct_name = 'hiphop'
replace_wrong_genres(duplicates, correct_name)
print(df)
        user_id                              track             artist  \
0      FFB692EC                  Kamigata To Boots   The Mass Missile
1      55204538        Delayed Because of Accident   Andreas Rönnberg
2        20EC38                  Funiculì funiculà        Mario Lanza
3      A3DD03C9              Dragons in the Sunset         Fire + Ice
4      E2DC1FAE                        Soul People         Space Echo
...         ...                                ...                ...
65074  729CBB09                            My Name             McLean
65075  D08D4A55  Maybe One Day (feat. Black Spade)        Blu & Exile
65076  C5E3A0D5                          Jalopiina            unknown
65077  321D0506                      Freight Train      Chas McDevitt
65078  3A64EF84          Tell Me Sweet Little Lies       Monica Lopez

            genre         city      time        day
0            rock  Shelbyville  20:28:33  Wednesday
1            rock  Springfield  14:07:09     Friday
2             pop  Shelbyville  20:58:07  Wednesday
3            folk  Shelbyville  08:37:09     Monday
4           dance  Springfield  08:34:34     Monday
...           ...          ...       ...        ...
65074         rnb  Springfield  13:32:28  Wednesday
65075      hiphop  Shelbyville  10:00:00     Monday
65076  industrial  Springfield  20:09:26     Friday
65077        rock  Springfield  21:43:59     Friday
65078     country  Springfield  21:59:46     Friday

[61253 rows x 7 columns]
Make sure the duplicate names were removed. Print the list of unique values from the 'genre' column:
# checking for implicit duplicates
df = df.sort_values(by='genre', ascending=True)  # sorting df by genre in alphabetical order
print(df['genre'].unique())  # 'hip', 'hop', and 'hip-hop' should now appear only as 'hiphop'
['acid' 'acoustic' 'action' 'adult' 'africa' 'afrikaans' 'alternative' 'ambient' 'americana' 'animated' 'anime' 'arabesk' 'arabic' 'arena' 'argentinetango' 'art' 'audiobook' 'avantgarde' 'axé' 'baile' 'balkan' 'beats' 'bigroom' 'black' 'bluegrass' 'blues' 'bollywood' 'bossa' 'brazilian' 'breakbeat' 'breaks' 'broadway' 'cantautori' 'cantopop' 'canzone' 'caribbean' 'caucasian' 'celtic' 'chamber' 'children' 'chill' 'chinese' 'choral' 'christian' 'christmas' 'classical' 'classicmetal' 'club' 'colombian' 'comedy' 'conjazz' 'contemporary' 'country' 'cuban' 'dance' 'dancehall' 'dancepop' 'dark' 'death' 'deep' 'deutschrock' 'deutschspr' 'dirty' 'disco' 'dnb' 'documentary' 'downbeat' 'downtempo' 'drum' 'dub' 'dubstep' 'eastern' 'easy' 'electronic' 'electropop' 'emo' 'entehno' 'epicmetal' 'estrada' 'ethnic' 'eurofolk' 'european' 'experimental' 'extrememetal' 'fado' 'film' 'fitness' 'flamenco' 'folk' 'folklore' 'folkmetal' 'folkrock' 'folktronica' 'forró' 'frankreich' 'französisch' 'french' 'funk' 'future' 'gangsta' 'garage' 'german' 'ghazal' 'gitarre' 'glitch' 'gospel' 'gothic' 'grime' 'grunge' 'gypsy' 'handsup' "hard'n'heavy" 'hardcore' 'hardstyle' 'hardtechno' 'hiphop' 'historisch' 'holiday' 'horror' 'house' 'idm' 'independent' 'indian' 'indie' 'indipop' 'industrial' 'inspirational' 'instrumental' 'international' 'irish' 'jam' 'japanese' 'jazz' 'jewish' 'jpop' 'jungle' 'k-pop' 'karadeniz' 'karaoke' 'kayokyoku' 'korean' 'laiko' 'latin' 'latino' 'leftfield' 'local' 'lounge' 'loungeelectronic' 'lovers' 'malaysian' 'mandopop' 'marschmusik' 'meditative' 'mediterranean' 'melodic' 'metal' 'metalcore' 'mexican' 'middle' 'minimal' 'miscellaneous' 'modern' 'mood' 'mpb' 'muslim' 'native' 'neoklassik' 'neue' 'new' 'newage' 'newwave' 'nu' 'nujazz' 'numetal' 'oceania' 'old' 'opera' 'orchestral' 'other' 'piano' 'pop' 'popelectronic' 'popeurodance' 'post' 'posthardcore' 'postrock' 'power' 'progmetal' 'progressive' 'psychedelic' 'punjabi' 'punk' 'quebecois' 'ragga' 'ram' 'rancheras' 'rap' 'rave' 'reggae' 'reggaeton' 'regional' 'relax' 'religious' 'retro' 'rhythm' 'rnb' 'rnr' 'rock' 'rockabilly' 'romance' 'roots' 'ruspop' 'rusrap' 'rusrock' 'salsa' 'samba' 'schlager' 'self' 'sertanejo' 'shoegazing' 'showtunes' 'singer' 'ska' 'slow' 'smooth' 'soul' 'soulful' 'sound' 'soundtrack' 'southern' 'specialty' 'speech' 'spiritual' 'sport' 'stonerrock' 'surf' 'swing' 'synthpop' 'sängerportrait' 'tango' 'tanzorchester' 'taraftar' 'tech' 'techno' 'thrash' 'top' 'traditional' 'tradjazz' 'trance' 'tribal' 'trip' 'triphop' 'tropical' 'türk' 'türkçe' 'unknown' 'urban' 'uzbek' 'variété' 'vi' 'videogame' 'vocal' 'western' 'world' 'worldbeat' 'ïîï']
# checking that merging the genre names did not create new exact duplicates
df.duplicated().sum()
0
# quick check: number of plays grouped by city
print(df.groupby('city')['track'].count())
city
Shelbyville    18512
Springfield    42741
Name: track, dtype: int64
# Counting up the tracks played in each city
tracks_by_city = df.groupby('city')['track'].count()
print(tracks_by_city)
city
Shelbyville    18512
Springfield    42741
Name: track, dtype: int64
# quick check: number of plays grouped by day
print(df.groupby('day')['track'].count())
day
Friday       21840
Monday       21354
Wednesday    18059
Name: track, dtype: int64
# Calculating tracks played on each of the three days
tracks_by_day = df.groupby('day')['track'].count()
print(tracks_by_day)
day
Friday       21840
Monday       21354
Wednesday    18059
Name: track, dtype: int64
# counting entries per day using the city column (totals match tracks_by_day)
day_by_city = df.groupby('day')['city'].count()
print(day_by_city)
day
Friday       21840
Monday       21354
Wednesday    18059
Name: city, dtype: int64
# counting entries per city using the day column (totals match tracks_by_city)
city_by_day = df.groupby('city')['day'].count()
print(city_by_day)
city
Shelbyville    18512
Springfield    42741
Name: day, dtype: int64
# number of values in day and city columns
df.loc[:,['day', 'city']].count()
day     61253
city    61253
dtype: int64
# filtering day and city columns
df_day_and_city = df.loc[:,['day', 'city']]
print(df_day_and_city)
             day         city
32883     Friday  Springfield
3650      Monday  Shelbyville
20644  Wednesday  Springfield
39129     Monday  Springfield
14811     Friday  Shelbyville
...          ...          ...
6189      Friday  Springfield
18268  Wednesday  Springfield
28665     Monday  Springfield
40493  Wednesday  Shelbyville
8514      Friday  Springfield

[61253 rows x 2 columns]
There are substantially more songs played in Springfield than in Shelbyville; more than double. Across both cities combined, most songs are played on Friday, followed by Monday and then Wednesday.
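To put a number on "more than double," here is a minimal sketch that computes each city's share of all plays (it assumes the cleaned df from the steps above):
# share of all plays per city (Springfield accounts for roughly 70%)
city_share = df.groupby('city')['user_id'].count() / len(df)
print(city_share.round(2))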
# creating the function number_tracks()
def number_tracks(day, city):
    # filter df to the rows matching the given day and city, then count them
    track_list = df.loc[(df['day'] == day) & (df['city'] == city), :]
    track_list_count = len(track_list)
    return track_list_count
# the number of songs played in Springfield on Monday
print('Monday: Springfield')
number_tracks('Monday', 'Springfield')
Monday: Springfield
15740
# the number of songs played in Shelbyville on Monday
print('Monday: Shelbyville')
number_tracks('Monday', 'Shelbyville')
Monday: Shelbyville
5614
# the number of songs played in Springfield on Wednesday
print('Wednesday: Springfield')
number_tracks('Wednesday', 'Springfield')
Wednesday: Springfield
11056
# the number of songs played in Shelbyville on Wednesday
print('Wednesday: Shelbyville')
number_tracks('Wednesday', 'Shelbyville')
Wednesday: Shelbyville
7003
# the number of songs played in Springfield on Friday
print('Friday: Springfield')
number_tracks('Friday', 'Springfield')
Friday: Springfield
15945
# the number of songs played in Shelbyville on Friday
print('Friday: Shelbyville')
number_tracks('Friday', 'Shelbyville')
Friday: Shelbyville
5895
# table with results
d = [['Springfield', 15740, 11056, 15945], ['Shelbyville', 5614, 7003, 5895]]
df2 = pd.DataFrame(data=d, columns=['city', 'monday', 'wednesday', 'friday'])
df2
df2
| | city | monday | wednesday | friday |
---|---|---|---|---|
0 | Springfield | 15740 | 11056 | 15945 |
1 | Shelbyville | 5614 | 7003 | 5895 |
As anticipated, the number of tracks played in Springfield is greater than in Shelbyville, even when grouped by day. Friday and Monday are the days with the most tracks played in Springfield, while in Shelbyville the most tracks are played on Wednesday, followed by Friday and Monday.
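The same city-by-day table can also be produced directly with a pivot table instead of assembling df2 by hand; a minimal sketch (the column order may differ):
# cross-tabulating the number of plays by city and day in a single call
plays_pivot = df.pivot_table(index='city', columns='day', values='user_id', aggfunc='count')
print(plays_pivot)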
# create the spr_general table from the df rows,
# where the value in the 'city' column is 'Springfield'
spr_general = df[df['city']=='Springfield']
spr_general
| | user_id | track | artist | genre | city | time | day |
---|---|---|---|---|---|---|---|
32883 | C40BC5DF | Shot in the Dark | Four80East | acid | Springfield | 09:44:12 | Friday |
20644 | B02ED3AB | Prelude – Ne Me Qui Te Pas | unknown | acoustic | Springfield | 13:31:03 | Wednesday |
39129 | 77D3A7D9 | Prelude – Ne Me Qui Te Pas | unknown | acoustic | Springfield | 14:27:34 | Monday |
60870 | 23C64044 | Don't Stop Me Now | Acoustic Heartstrings | acoustic | Springfield | 13:30:01 | Wednesday |
27028 | 4D96EB78 | Eventuality | Jack Trammell | action | Springfield | 21:22:51 | Monday |
... | ... | ... | ... | ... | ... | ... | ... |
21799 | 77DBBA7A | Accordion Rhythm | Ahmed Nasr | world | Springfield | 08:50:36 | Friday |
6189 | 12A10DD5 | Ekatana | Shantavaani | world | Springfield | 14:30:19 | Friday |
18268 | 653E0EF | Kalinka | The Red Army Choirs Of Alexandrov (Les Choeurs... | world | Springfield | 21:08:36 | Wednesday |
28665 | D7FB50DA | Drumming Circle | Professor Trance | worldbeat | Springfield | 09:30:47 | Monday |
8514 | A439123F | Flip It | unknown | ïîï | Springfield | 09:08:51 | Friday |
42741 rows × 7 columns
# create the shel_general from the df rows,
# where the value in the 'city' column is 'Shelbyville'
shel_general = df[df['city']=='Shelbyville']
shel_general
| | user_id | track | artist | genre | city | time | day |
---|---|---|---|---|---|---|---|
3650 | F10919ED | Ready For The Fire | Valley Of Wolves | acoustic | Shelbyville | 21:33:18 | Monday |
14811 | 1C7D9E90 | Prelude – Ne Me Qui Te Pas | unknown | acoustic | Shelbyville | 21:47:42 | Friday |
18559 | 897AAB4E | Betcha Never | Cherie | adult | Shelbyville | 09:30:39 | Friday |
35110 | 4DCD818F | Blue Skies Again | Jessica Lea Mayfield | adult | Shelbyville | 09:05:48 | Wednesday |
49611 | 826C2C51 | Your Song | Ben Cocks | adult | Shelbyville | 20:48:28 | Friday |
... | ... | ... | ... | ... | ... | ... | ... |
21601 | 6B1890C6 | Unknown Dream | Farhad | world | Shelbyville | 14:46:01 | Wednesday |
21666 | F3D7493D | Hello peppinedda my girl josephine | Dany Danubio | world | Shelbyville | 21:16:10 | Monday |
41840 | DC347AA4 | La Sitiera | Omara Portuondo | world | Shelbyville | 21:32:36 | Wednesday |
50582 | 4DC32D69 | Oath Sign | DJ Mocchi | world | Shelbyville | 14:05:36 | Friday |
40493 | AA1730E8 | Anu | Kailash Kokopelli | worldbeat | Shelbyville | 20:16:34 | Wednesday |
18512 rows × 7 columns
# function returning the 15 most popular genres for a given day and time window
def genre_weekday(data, day_of_week, first_time, second_time):
    # consecutive filtering:
    # keep only the rows where the day matches day_of_week
    genre_df = data[data['day'] == day_of_week]
    # keep only the plays no later than second_time
    # (the 'HH:MM:SS' strings compare correctly, so no datetime conversion is needed)
    genre_df = genre_df[genre_df['time'] <= second_time]
    # keep only the plays no earlier than first_time
    genre_df = genre_df[genre_df['time'] >= first_time]
    # group the filtered rows by genre and count the plays for each genre
    genre_df_count = genre_df.groupby('genre')['user_id'].count()
    # sort the counts in descending order so the most popular genres come first
    genre_df_sorted = genre_df_count.sort_values(ascending=False)
    # return the 15 most popular genres for the given day and timeframe
    return genre_df_sorted[:15]
Compare the results of the genre_weekday() function for Springfield and Shelbyville on Monday morning (from 07:00 to 11:00) and on Friday evening (from 17:00 to 23:00):
# calling the function for Monday morning in Springfield (use spr_general instead of the df table)
print('Monday morning Springfield')
genre_weekday(spr_general, 'Monday', '07:00:00', '11:00:00')
Monday morning Springfield
genre
pop            781
dance          549
electronic     480
rock           474
hiphop         286
ruspop         186
world          181
rusrap         175
alternative    164
unknown        161
classical      157
metal          120
jazz           100
folk            97
soundtrack      95
Name: user_id, dtype: int64
# calling the function for Monday morning in Shelbyville (use shel_general instead of the df table)
print('Monday morning Shelbyville')
genre_weekday(shel_general, 'Monday', '07:00', '11:00')
Monday morning Shelbyville
genre
pop            218
dance          182
rock           162
electronic     147
hiphop          80
ruspop          64
alternative     58
rusrap          55
jazz            44
classical       40
world           36
rap             32
soundtrack      31
rnb             27
metal           27
Name: user_id, dtype: int64
# calling the function for Friday evening in Springfield
print('Friday evening Springfield')
genre_weekday(spr_general, 'Friday', '17:00', '23:00')
Friday evening Springfield
genre
pop            713
rock           517
dance          495
electronic     482
hiphop         273
world          208
ruspop         170
classical      163
alternative    163
rusrap         142
jazz           111
unknown        110
soundtrack     105
rnb             90
metal           88
Name: user_id, dtype: int64
# calling the function for Friday evening in Shelbyville
print('Friday evening Shelbyville')
genre_weekday(shel_general, 'Friday', '17:00', '23:00')
Friday evening Shelbyville
genre
pop            256
rock           216
electronic     216
dance          210
hiphop          97
alternative     63
jazz            61
classical       60
rusrap          59
world           54
unknown         47
ruspop          47
soundtrack      40
metal           39
rap             36
Name: user_id, dtype: int64
Pop music is the most popular genre in both cities, in both the Monday-morning and Friday-evening windows. Dance, rock, and electronic are also consistently popular in both.
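To compare the four rankings side by side, the Series returned by genre_weekday() can be concatenated into one table; a minimal sketch (the column labels are arbitrary, and NaN marks genres outside a city's top 15):
# combining the four time windows into one comparison table
top_genres = pd.concat(
    {
        'spr_mon_am': genre_weekday(spr_general, 'Monday', '07:00:00', '11:00:00'),
        'shel_mon_am': genre_weekday(shel_general, 'Monday', '07:00:00', '11:00:00'),
        'spr_fri_pm': genre_weekday(spr_general, 'Friday', '17:00:00', '23:00:00'),
        'shel_fri_pm': genre_weekday(shel_general, 'Friday', '17:00:00', '23:00:00'),
    },
    axis=1,
)
print(top_genres)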
Hypothesis: Shelbyville loves rap music. Springfield's citizens are more into pop.
# counting the plays for each genre in Springfield
spr_genres = spr_general.groupby('genre')['user_id'].count().sort_values(ascending=False)
# printing the first 10 rows of spr_genres
print('Songs played for each genre : Springfield')
spr_genres.head(10)
Songs played for each genre : Springfield
genre
pop            5892
dance          4435
rock           3965
electronic     3786
hiphop         2096
classical      1616
world          1432
alternative    1379
ruspop         1372
rusrap         1161
Name: user_id, dtype: int64
# counting the plays for each genre in Shelbyville
shel_genres = shel_general.groupby('genre')['user_id'].count().sort_values(ascending=False)
# printing the first 10 rows from shel_genres
print('Songs played for each genre : Shelbyville')
shel_genres.head(10)
Songs played for each genre : Shelbyville
genre
pop            2431
dance          1932
rock           1879
electronic     1736
hiphop          960
alternative     649
classical       646
rusrap          564
ruspop          538
world           515
Name: user_id, dtype: int64
The hypothesis that Springfield's citizens are into pop music is correct. However, Shelbyville's citizens also prefer pop, and rap is less popular there than hypothesized.
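One way to make this conclusion quantitative is to compare each genre's share of plays within its own city; a minimal sketch, assuming spr_genres and shel_genres from the cells above:
# share of each city's plays for the genres relevant to the hypothesis
genre_share = pd.DataFrame({
    'springfield': spr_genres / spr_genres.sum(),
    'shelbyville': shel_genres / shel_genres.sum(),
})
# reindex() keeps the comparison safe even if a genre is missing in one city
print(genre_share.reindex(['pop', 'hiphop', 'rusrap', 'rap']).round(3))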