Zuber is a new ride-sharing company launching in Chicago. We are tasked with finding patterns in the available information. The goal of the project is to understand passenger preferences and the impact of external factors on rides. Working with the database, we will analyze data from competitors and test a hypothesis about the impact of weather on ride frequency.
We hypothesize that ride frequency increases in bad weather, and that fewer rides are requested when the weather is good. As a large metropolitan city, Chicago is fairly walkable when the weather permits; bad weather may push people to hail a ride rather than walk or bike in the elements.
# import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats as st
import numpy as np
# web scrape data
URL = 'https://code.s3.yandex.net/data-analyst-eng/chicago_weather_2017.html'
req = requests.get(URL)
soup = BeautifulSoup(req.text, 'lxml')
table = soup.find('table', attrs={"id": "weather_records"})
heading_table = []
for row in table.find_all('th'):
    heading_table.append(row.text)
content = []
for row in table.find_all('tr'):
    if not row.find_all('th'):
        content.append([element.text for element in row.find_all('td')])
weather_records = pd.DataFrame(content, columns = heading_table)
print(weather_records)
           Date and time Temperature       Description
0    2017-11-01 00:00:00     276.150     broken clouds
1    2017-11-01 01:00:00     275.700  scattered clouds
2    2017-11-01 02:00:00     275.610   overcast clouds
3    2017-11-01 03:00:00     275.350     broken clouds
4    2017-11-01 04:00:00     275.240     broken clouds
..                   ...         ...               ...
692  2017-11-29 20:00:00     281.340        few clouds
693  2017-11-29 21:00:00     281.690      sky is clear
694  2017-11-29 22:00:00     281.070        few clouds
695  2017-11-29 23:00:00     280.060      sky is clear
696  2017-11-30 00:00:00     278.460      sky is clear

[697 rows x 3 columns]
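The temperatures appear to be reported in kelvins (values near 276–281 K for November). As a minimal sanity-check sketch, not part of the original analysis, we can cast the scraped strings to proper types and add a Celsius column for readability:
# All scraped values are strings; cast temperature to float and timestamps to datetime
weather_records['Temperature'] = weather_records['Temperature'].astype(float)
weather_records['Date and time'] = pd.to_datetime(weather_records['Date and time'])
# Convert kelvins to degrees Celsius (0 °C = 273.15 K) as a readability check
weather_records['temp_celsius'] = weather_records['Temperature'] - 273.15
print(weather_records['temp_celsius'].describe())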
# read dataframes
df_1 = pd.read_csv('datasets/project_sql_result_01.csv')  # trips per taxi company
df_2 = pd.read_csv('datasets/project_sql_result_04.csv')  # average trips per drop-off neighborhood
df_3 = pd.read_csv('datasets/project_sql_result_07.csv')  # rides from the Loop to O'Hare, with weather conditions
# Overview of data
df_1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   company_name  64 non-null     object
 1   trips_amount  64 non-null     int64
dtypes: int64(1), object(1)
memory usage: 1.1+ KB
# Looking for missing values
df_1.isna().sum()
company_name    0
trips_amount    0
dtype: int64
# Looking for duplicates
df_1.duplicated().sum()
0
# Looking at different company names
df_1['company_name'].unique()
array(['Flash Cab', 'Taxi Affiliation Services', 'Medallion Leasin', 'Yellow Cab', 'Taxi Affiliation Service Yellow', 'Chicago Carriage Cab Corp', 'City Service', 'Sun Taxi', 'Star North Management LLC', 'Blue Ribbon Taxi Association Inc.', 'Choice Taxi Association', 'Globe Taxi', 'Dispatch Taxi Affiliation', 'Nova Taxi Affiliation Llc', 'Patriot Taxi Dba Peace Taxi Associat', 'Checker Taxi Affiliation', 'Blue Diamond', 'Chicago Medallion Management', '24 Seven Taxi', 'Chicago Medallion Leasing INC', 'Checker Taxi', 'American United', 'Chicago Independents', 'KOAM Taxi Association', 'Chicago Taxicab', 'Top Cab Affiliation', 'Gold Coast Taxi', 'Service Taxi Association', '5 Star Taxi', '303 Taxi', 'Setare Inc', 'American United Taxi Affiliation', 'Leonard Cab Co', 'Metro Jet Taxi A', 'Norshore Cab', '6742 - 83735 Tasha ride inc', '3591 - 63480 Chuks Cab', '1469 - 64126 Omar Jada', '6743 - 78771 Luhak Corp', '0118 - 42111 Godfrey S.Awir', '6574 - Babylon Express Inc.', 'Chicago Star Taxicab', '1085 - 72312 N and W Cab Co', '2809 - 95474 C & D Cab Co Inc.', '2092 - 61288 Sbeih company', '3011 - 66308 JBL Cab Inc.', '3620 - 52292 David K. Cab Corp.', '4615 - 83503 Tyrone Henderson', '3623 - 72222 Arrington Enterprises', '5074 - 54002 Ahzmi Inc', '2823 - 73307 Lee Express Inc', '4623 - 27290 Jay Kim', '3721 - Santamaria Express, Alvaro Santamaria', '5006 - 39261 Salifu Bawa', '2192 - 73487 Zeymane Corp', '6057 - 24657 Richard Addo', '5997 - 65283 AW Services Inc.', 'Metro Group', '5062 - 34841 Sam Mestas', '4053 - 40193 Adwar H. Nikola', '2733 - 74600 Benny Jona', '5874 - 73628 Sergey Cab Corp.', '2241 - 44667 - Felman Corp, Manuel Alonso', '3556 - 36214 RC Andrews Cab'], dtype=object)
# Top 20 companies by trips amount
df_1.sort_values(by='trips_amount', ascending=False).head(20)
|  | company_name | trips_amount |
| --- | --- | --- |
| 0 | Flash Cab | 19558 |
| 1 | Taxi Affiliation Services | 11422 |
| 2 | Medallion Leasin | 10367 |
| 3 | Yellow Cab | 9888 |
| 4 | Taxi Affiliation Service Yellow | 9299 |
| 5 | Chicago Carriage Cab Corp | 9181 |
| 6 | City Service | 8448 |
| 7 | Sun Taxi | 7701 |
| 8 | Star North Management LLC | 7455 |
| 9 | Blue Ribbon Taxi Association Inc. | 5953 |
| 10 | Choice Taxi Association | 5015 |
| 11 | Globe Taxi | 4383 |
| 12 | Dispatch Taxi Affiliation | 3355 |
| 13 | Nova Taxi Affiliation Llc | 3175 |
| 14 | Patriot Taxi Dba Peace Taxi Associat | 2235 |
| 15 | Checker Taxi Affiliation | 2216 |
| 16 | Blue Diamond | 2070 |
| 17 | Chicago Medallion Management | 1955 |
| 18 | 24 Seven Taxi | 1775 |
| 19 | Chicago Medallion Leasing INC | 1607 |
Data looks clean. No alterations are needed. We do not have missing values or duplicates.
# Overview of data
df_2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94 entries, 0 to 93
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   dropoff_location_name  94 non-null     object
 1   average_trips          94 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1.6+ KB
# Looking for missing values
df_2.isna().sum()
dropoff_location_name    0
average_trips            0
dtype: int64
# looking for duplicates
df_2.duplicated().sum()
0
# looking at different drop off locations
df_2['dropoff_location_name'].unique()
array(['Loop', 'River North', 'Streeterville', 'West Loop', "O'Hare", 'Lake View', 'Grant Park', 'Museum Campus', 'Gold Coast', 'Sheffield & DePaul', 'Lincoln Park', 'East Village', 'Little Italy, UIC', 'Uptown', 'Near South Side', 'Garfield Ridge', 'Logan Square', 'Edgewater', 'West Town', 'Old Town', 'Rush & Division', 'North Center', 'Lincoln Square', 'Rogers Park', 'West Ridge', 'Irving Park', 'Hyde Park', 'Avondale', 'Wicker Park', 'Albany Park', 'United Center', 'Lower West Side', 'Douglas', 'Portage Park', 'Humboldt Park', 'Norwood Park', 'Kenwood', 'Bridgeport', 'Armour Square', 'Jefferson Park', 'Bucktown', 'North Park', 'Garfield Park', 'Mckinley Park', 'Belmont Cragin', 'Boystown', 'Chinatown', 'Grand Boulevard', 'Austin', 'Sauganash,Forest Glen', 'South Shore', 'Woodlawn', 'Little Village', 'Jackson Park', 'North Lawndale', 'Dunning', 'Ukrainian Village', 'Hermosa', 'Englewood', 'Chatham', 'New City', 'Grand Crossing', 'Brighton Park', 'Andersonville', 'Oakland', 'Ashburn', 'Washington Park', 'Auburn Gresham', 'Clearing', 'Roseland', 'Wrigleyville', 'Edison Park', 'South Chicago', 'Chicago Lawn', 'Beverly', 'Calumet Heights', 'Washington Heights', 'Archer Heights', 'Gage Park', 'Fuller Park', 'West Lawn', 'Morgan Park', 'South Deering', 'Avalon Park', 'West Pullman', 'Montclare', 'West Elsdon', 'Pullman', 'Undef', 'Mount Greenwood', 'Hegewisch', 'Burnside', 'East Side', 'Riverdale'], dtype=object)
Data looks clean. No alterations are needed. We do not have missing values or duplicates.
# Change start_ts to datetime format
df_3['start_ts'] = pd.to_datetime(df_3['start_ts'])
# looking at types of start, and other columns
df_3.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1068 entries, 0 to 1067
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   start_ts            1068 non-null   datetime64[ns]
 1   weather_conditions  1068 non-null   object
 2   duration_seconds    1068 non-null   float64
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 25.2+ KB
# looking at unique weather conditions
df_3['weather_conditions'].unique()
array(['Good', 'Bad'], dtype=object)
# Feature engineering
df_3['day'] = df_3['start_ts'].dt.day
df_3['hour'] = df_3['start_ts'].dt.hour
df_3['duration_seconds'] = df_3['duration_seconds']/60
df_3.rename(columns={'duration_seconds': 'duration_minutes'}, inplace=True)
We added day and hour columns by extracting them from the start timestamp, and converted the ride duration from seconds to minutes.
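A quick spot-check that the extraction and the seconds-to-minutes conversion behaved as expected:
# Spot-check the engineered columns
print(df_3[['start_ts', 'day', 'hour', 'duration_minutes']].head())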
# Top 10 Taxi Companies
px.bar(df_1.head(10), x='company_name', y='trips_amount', title='Top 10 Taxi Companies', height=900)
# Top 10 Taxi Companies share
px.pie(df_1.head(10), names='company_name', values='trips_amount', height=900, title='Top 10 Taxi Companies')
Flash Cab is the most popular taxi company by far, with about 1.7 times as many trips as the second-place competitor, Taxi Affiliation Services. Among the top 10 companies, Flash Cab accounts for 19.7% of trips.
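The 19.7% figure can be reproduced directly from the data; a small sketch computing Flash Cab's share of trips among the top 10 companies:
# Share of trips held by the leader among the top 10 companies
top_10 = df_1.sort_values(by='trips_amount', ascending=False).head(10)
flash_trips = top_10.loc[top_10['company_name'] == 'Flash Cab', 'trips_amount'].iloc[0]
print(f'Flash Cab share of top-10 trips: {flash_trips / top_10["trips_amount"].sum():.1%}')  # ~19.7%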
# Top 10 Drop Off Locations
px.bar(df_2.head(10), x='dropoff_location_name', y='average_trips', title='Top 10 Drop Off Locations', height=900)
# Top 10 Dropoff Locations share
px.pie(df_2.head(10), names='dropoff_location_name', values='average_trips', title='Top 10 Dropoff Locations', height=900)
The Loop is the most popular drop-off location, followed by River North, Streeterville, and the West Loop; these neighborhoods make up the downtown Chicago area. O'Hare is only the fifth most popular drop-off location, which is interesting because the international airport is located there. This suggests there is a more popular means of transportation to the airport, such as the bus or the train. Among the top 10 locations, the Loop and River North together account for roughly half of the average trips.
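The "roughly half" claim can likewise be checked numerically; a sketch, sorting explicitly even though the earlier plots imply df_2 is already sorted:
# Combined share of the two busiest drop-off locations among the top 10
top_10_drop = df_2.sort_values(by='average_trips', ascending=False).head(10)
pair = top_10_drop[top_10_drop['dropoff_location_name'].isin(['Loop', 'River North'])]
print(f"Loop + River North share of top-10 trips: {pair['average_trips'].sum() / top_10_drop['average_trips'].sum():.1%}")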
# summary statistics on ride duration
df_3['duration_minutes'].describe()
count    1068.000000
mean       34.528855
std        12.824352
min         0.000000
25%        23.970833
50%        33.000000
75%        43.000000
max       124.000000
Name: duration_minutes, dtype: float64
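The minimum duration of 0 minutes stands out as a possible data-quality issue; a quick check, not part of the original analysis, for how many rides record a zero duration:
# Count rides whose recorded duration is exactly zero
zero_rides = (df_3['duration_minutes'] == 0).sum()
print(zero_rides, 'rides have a duration of 0 minutes')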
# Correlation of duration with day and hour (numeric_only avoids errors on the non-numeric columns in pandas >= 1.5)
df_3.corr(numeric_only=True)
|  | duration_minutes | day | hour |
| --- | --- | --- | --- |
| duration_minutes | 1.000000 | -0.159101 | 0.588227 |
| day | -0.159101 | 1.000000 | -0.058946 |
| hour | 0.588227 | -0.058946 | 1.000000 |
We see a moderate positive correlation (0.59) between ride duration and the hour of the day, and only a weak negative correlation with the day of the month.
# Extracting weather conditions
weather_conditions = df_3['weather_conditions'].value_counts()
# Getting hourly weather conditions
hourly_weather = df_3.groupby('weather_conditions')['hour'].value_counts().reset_index(level=0)
hourly_weather = hourly_weather.rename(columns={'hour': 'count'})
# getting daily weather conditions
daily_weather = df_3.groupby('weather_conditions')['day'].value_counts().reset_index(level=0)
daily_weather = daily_weather.rename(columns={'day': 'count'})
# Ride duration based on weather
duration_weather = df_3.groupby('weather_conditions')['duration_minutes'].value_counts().reset_index(level=0)
duration_weather = duration_weather.rename(columns={'duration_minutes': 'count'})
duration_weather = duration_weather.reset_index()
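The reset_index(level=0) pattern above is sensitive to pandas version details (the name of the value_counts result changed in pandas 2.0); a sketch of an equivalent, more explicit way to build the same hourly counts with groupby().size():
# Equivalent hourly counts per weather condition, without relying on value_counts naming
hourly_weather_alt = (
    df_3.groupby(['weather_conditions', 'hour'])
        .size()
        .reset_index(name='count')
)
print(hourly_weather_alt.head())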
# Weather Conditions
px.bar(weather_conditions, title='Weather Conditions', color_discrete_sequence=['blue', 'red'])
# weather conditions by the hour
px.bar(hourly_weather, y='count', title='Weather Conditions by Hour', color='weather_conditions', height=700)
# Weather Conditions by Day
px.bar(daily_weather, y='count', title='Weather Conditions by Day', color='weather_conditions', height=700)
# Ride Durations
px.scatter(duration_weather, x='duration_minutes', y='count', color='weather_conditions', title='Ride Durations', height=700)
Our earlier hypothesis about the number of rides and weather conditions was wrong. More rides are observed when the weather is good than when it is bad: 888 rides in good weather versus 180 in bad. The number of rides peaks around noon and diminishes late in the evening and early in the morning. The 11th and the 25th both had entirely good weather, yet the 11th saw more rides. The 4th and the 18th both saw a mix of good and bad weather, yet the 4th had more rides of the two. November 10th saw the most rides in the data, and the weather was good that day. The overall average ride duration is 34 minutes, and the maximum is 124 minutes.
# List of ride durations on good days
good = df_3[df_3['weather_conditions'] == 'Good']
good = good['duration_minutes'].reset_index(drop=True)
good_list = good.to_list()
# Summary statistics of good days
good.describe()
count    888.000000
mean      33.327928
std       12.653304
min        0.000000
25%       23.162500
50%       30.000000
75%       41.000000
max      124.000000
Name: duration_minutes, dtype: float64
# List of ride durations on bad days
bad = df_3[df_3['weather_conditions'] == 'Bad']
bad = bad['duration_minutes'].reset_index(drop=True)
bad_list = bad.to_list()
# Summary statistics of bad days
bad.describe()
count    180.000000
mean      40.453426
std       12.021902
min        8.000000
25%       32.700000
50%       42.333333
75%       48.800000
max       83.000000
Name: duration_minutes, dtype: float64
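Before testing formally, the raw gap between the two group means (about 7.1 minutes, per the summaries above) is worth noting:
# Difference between mean ride durations in bad vs good weather
print(f'Mean duration gap: {bad.mean() - good.mean():.1f} minutes')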
alpha = 0.05  # critical statistical significance level
# H0: the average duration of rides from the Loop to O'Hare is the same in good and bad weather
# H1: the average durations differ
# if the p-value is less than alpha, we reject the null hypothesis
results = st.ttest_ind(good_list, bad_list)
print('p-value: ', results.pvalue)
if results.pvalue < alpha:
    print("We reject the null hypothesis: the average duration of rides from the Loop to O'Hare International Airport differs between good and bad weather")
else:
    print("We can't reject the null hypothesis: the average duration of rides from the Loop to O'Hare International Airport is the same in good and bad weather")
p-value:  6.517970327099781e-12
We reject the null hypothesis: the average duration of rides from the Loop to O'Hare International Airport differs between good and bad weather
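The test above uses the SciPy default of pooled variances. As an additional robustness check, not part of the original analysis, Welch's t-test drops the equal-variance assumption; with group standard deviations this close (12.65 vs 12.02), the conclusion should not change:
# Welch's t-test: does not assume equal variances between the two groups
welch = st.ttest_ind(good_list, bad_list, equal_var=False)
print('Welch p-value:', welch.pvalue)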
Hypothesis testing confirms that weather plays a crucial role in the average ride duration from the Loop to O'Hare International Airport: bad weather changes the average ride duration. The data shows that downtown areas are the most popular drop-off locations, and that more rides were taken on days with good weather. Overall, Zuber should incentivize its drivers to participate when the weather is good. We should also target areas within about a 34-minute drive of downtown Chicago on a good day, and about a 40-minute drive on a bad day.