Analyzing Uber Data
%pylab inline
import pandas
import seaborn
Populating the interactive namespace from numpy and matplotlib
data = pandas.read_csv('Desktop/uber-raw-data-apr14.txt')
Explore your data
data.head()
Date/Time | Lat | Lon | Base | |
---|---|---|---|---|
0 | 4/1/2014 0:11:00 | 40.7690 | -73.9549 | B02512 |
1 | 4/1/2014 0:17:00 | 40.7267 | -74.0345 | B02512 |
2 | 4/1/2014 0:21:00 | 40.7316 | -73.9873 | B02512 |
3 | 4/1/2014 0:28:00 | 40.7588 | -73.9776 | B02512 |
4 | 4/1/2014 0:33:00 | 40.7594 | -73.9722 | B02512 |
The date and time are stored as a single string, so we need to split it and convert the parts to integers:
dt = '4/1/2014 0:11:00'
dt
'4/1/2014 0:11:00'
d,t = dt.split(' ')
print(d)
print(t)
4/1/2014
0:11:00
m,d,y=d.split('/')
d
'1'
int(d)
1
Another way to do it:
dt=pandas.to_datetime(dt)
dt
Timestamp('2014-04-01 00:11:00')
Here we have some nice functions to play with:
# `Timestamp.weekday_name` is deprecated (and removed in later pandas
# releases); `day_name()` is the supported way to get the weekday name.
dt.day_name()
C:\Users\hamad\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: `weekday_name` is deprecated and will be removed in a future version. Use `day_name` instead
"""Entry point for launching an IPython kernel.
'Tuesday'
dt.weekday()
1
Now we convert the whole column:
# Parse the whole column in one vectorised call instead of mapping
# pandas.to_datetime over every element. The explicit format matches the
# month/day/year strings seen in the raw data (e.g. '4/1/2014 0:11:00').
data['Date/Time'] = pandas.to_datetime(data['Date/Time'], format='%m/%d/%Y %H:%M:%S')
data.head()
Date/Time | Lat | Lon | Base | |
---|---|---|---|---|
0 | 2014-04-01 00:11:00 | 40.7690 | -73.9549 | B02512 |
1 | 2014-04-01 00:17:00 | 40.7267 | -74.0345 | B02512 |
2 | 2014-04-01 00:21:00 | 40.7316 | -73.9873 | B02512 |
3 | 2014-04-01 00:28:00 | 40.7588 | -73.9776 | B02512 |
4 | 2014-04-01 00:33:00 | 40.7594 | -73.9722 | B02512 |
data['Date/Time'][0]
Timestamp('2014-04-01 00:11:00')
Let us create a function that returns the day of the month:
def getdom(dt):
    """Return the day of the month of *dt*.

    *dt* is any object exposing a ``day`` attribute; in this notebook it is
    mapped over the ``Date/Time`` column after conversion to timestamps.
    """
    return dt.day
data['dom'] = data['Date/Time'].map(getdom)
data.tail()
Date/Time | Lat | Lon | Base | dom | |
---|---|---|---|---|---|
564511 | 2014-04-30 23:22:00 | 40.7640 | -73.9744 | B02764 | 30 |
564512 | 2014-04-30 23:26:00 | 40.7629 | -73.9672 | B02764 | 30 |
564513 | 2014-04-30 23:31:00 | 40.7443 | -73.9889 | B02764 | 30 |
564514 | 2014-04-30 23:32:00 | 40.6756 | -73.9405 | B02764 | 30 |
564515 | 2014-04-30 23:48:00 | 40.6880 | -73.9608 | B02764 | 30 |
data.head()
Date/Time | Lat | Lon | Base | dom | |
---|---|---|---|---|---|
0 | 2014-04-01 00:11:00 | 40.7690 | -73.9549 | B02512 | 1 |
1 | 2014-04-01 00:17:00 | 40.7267 | -74.0345 | B02512 | 1 |
2 | 2014-04-01 00:21:00 | 40.7316 | -73.9873 | B02512 | 1 |
3 | 2014-04-01 00:28:00 | 40.7588 | -73.9776 | B02512 | 1 |
4 | 2014-04-01 00:33:00 | 40.7594 | -73.9722 | B02512 | 1 |
Create another function for the weekday
def getwkday(dt):
    """Return the weekday number of *dt* as given by ``dt.weekday()``.

    For datetime-like objects this is 0 for Monday through 6 for Sunday
    (1 April 2014, a Tuesday, yields 1 in the output above).
    """
    return dt.weekday()
data['weekday']= data['Date/Time'].map(getwkday)
Create a function to get the hour of the day
def gethr(dt):
    """Return the hour of the day of *dt* (its ``hour`` attribute)."""
    return dt.hour
data['hour']=data['Date/Time'].map(gethr)
data.head()
Date/Time | Lat | Lon | Base | dom | weekday | hour | |
---|---|---|---|---|---|---|---|
0 | 2014-04-01 00:11:00 | 40.7690 | -73.9549 | B02512 | 1 | 1 | 0 |
1 | 2014-04-01 00:17:00 | 40.7267 | -74.0345 | B02512 | 1 | 1 | 0 |
2 | 2014-04-01 00:21:00 | 40.7316 | -73.9873 | B02512 | 1 | 1 | 0 |
3 | 2014-04-01 00:28:00 | 40.7588 | -73.9776 | B02512 | 1 | 1 | 0 |
4 | 2014-04-01 00:33:00 | 40.7594 | -73.9722 | B02512 | 1 | 1 | 0 |
data.tail()
Date/Time | Lat | Lon | Base | dom | weekday | hour | |
---|---|---|---|---|---|---|---|
564511 | 2014-04-30 23:22:00 | 40.7640 | -73.9744 | B02764 | 30 | 2 | 23 |
564512 | 2014-04-30 23:26:00 | 40.7629 | -73.9672 | B02764 | 30 | 2 | 23 |
564513 | 2014-04-30 23:31:00 | 40.7443 | -73.9889 | B02764 | 30 | 2 | 23 |
564514 | 2014-04-30 23:32:00 | 40.6756 | -73.9405 | B02764 | 30 | 2 | 23 |
564515 | 2014-04-30 23:48:00 | 40.6880 | -73.9608 | B02764 | 30 | 2 | 23 |
Analysis:
Analyze the date of the month:
hist(data.dom)
(array([52721., 59680., 52581., 58631., 45427., 56764., 38781., 60673.,
64697., 74561.]),
array([ 1. , 3.9, 6.8, 9.7, 12.6, 15.5, 18.4, 21.3, 24.2, 27.1, 30. ]),
<a list of 10 Patch objects>)
hist(data.dom, bins = 30 )
(array([14546., 17474., 20701., 26714., 19521., 13445., 19550., 16188.,
16843., 20041., 20420., 18170., 12112., 12674., 20641., 17717.,
20973., 18074., 14602., 11017., 13162., 16975., 20346., 23352.,
25095., 24925., 14677., 15475., 22835., 36251.]),
array([ 1. , 1.96666667, 2.93333333, 3.9 , 4.86666667,
5.83333333, 6.8 , 7.76666667, 8.73333333, 9.7 ,
10.66666667, 11.63333333, 12.6 , 13.56666667, 14.53333333,
15.5 , 16.46666667, 17.43333333, 18.4 , 19.36666667,
20.33333333, 21.3 , 22.26666667, 23.23333333, 24.2 ,
25.16666667, 26.13333333, 27.1 , 28.06666667, 29.03333333,
30. ]),
<a list of 30 Patch objects>)
hist(data.dom, bins = 30, rwidth=0.8)
(array([14546., 17474., 20701., 26714., 19521., 13445., 19550., 16188.,
16843., 20041., 20420., 18170., 12112., 12674., 20641., 17717.,
20973., 18074., 14602., 11017., 13162., 16975., 20346., 23352.,
25095., 24925., 14677., 15475., 22835., 36251.]),
array([ 1. , 1.96666667, 2.93333333, 3.9 , 4.86666667,
5.83333333, 6.8 , 7.76666667, 8.73333333, 9.7 ,
10.66666667, 11.63333333, 12.6 , 13.56666667, 14.53333333,
15.5 , 16.46666667, 17.43333333, 18.4 , 19.36666667,
20.33333333, 21.3 , 22.26666667, 23.23333333, 24.2 ,
25.16666667, 26.13333333, 27.1 , 28.06666667, 29.03333333,
30. ]),
<a list of 30 Patch objects>)
# 30 unit-wide bins over (0.5, 30.5) centre each bar on an integer day.
hist(data.dom, bins=30, rwidth=0.8, range=(0.5, 30.5))
xlabel('Date of the Month')
ylabel('Frequency')
# Title typo fixed ('Frquency' -> 'Frequency').
title('Frequency by Date of Month - Uber - April 2014')
Text(0.5,1,'Frquency by Date of Month - Uber - April 2014')
# Print a (day of month, number of rows) tuple for every group.
for day, rows in data.groupby('dom'):
    print((day, len(rows)))
(1, 14546)
(2, 17474)
(3, 20701)
(4, 26714)
(5, 19521)
(6, 13445)
(7, 19550)
(8, 16188)
(9, 16843)
(10, 20041)
(11, 20420)
(12, 18170)
(13, 12112)
(14, 12674)
(15, 20641)
(16, 17717)
(17, 20973)
(18, 18074)
(19, 14602)
(20, 11017)
(21, 13162)
(22, 16975)
(23, 20346)
(24, 23352)
(25, 25095)
(26, 24925)
(27, 14677)
(28, 15475)
(29, 22835)
(30, 36251)
Another way of doing it
def count_rows(rows):
    """Return the number of rows in *rows*, i.e. ``len(rows)``.

    Used below as the aggregation function passed to ``groupby(...).apply``.
    """
    return len(rows)
by_date = data.groupby('dom').apply(count_rows)
by_date
dom
1 14546
2 17474
3 20701
4 26714
5 19521
6 13445
7 19550
8 16188
9 16843
10 20041
11 20420
12 18170
13 12112
14 12674
15 20641
16 17717
17 20973
18 18074
19 14602
20 11017
21 13162
22 16975
23 20346
24 23352
25 25095
26 24925
27 14677
28 15475
29 22835
30 36251
dtype: int64
plot(by_date)
[<matplotlib.lines.Line2D at 0x1e85e54eac8>]
Let's sort the days of the month by their number of trips:
by_date_sorted = by_date.sort_values()
by_date_sorted
dom
20 11017
13 12112
14 12674
21 13162
6 13445
1 14546
19 14602
27 14677
28 15475
8 16188
9 16843
22 16975
2 17474
16 17717
18 18074
12 18170
5 19521
7 19550
10 20041
23 20346
11 20420
15 20641
3 20701
17 20973
29 22835
24 23352
26 24925
25 25095
4 26714
30 36251
dtype: int64
# Bars sit at positions 1..30 in ascending-frequency order; the tick labels
# show which day of the month each bar belongs to.
bar(range(1, 31), by_date_sorted)
xticks(range(1, 31), by_date_sorted.index)
xlabel('Date of the Month')
ylabel('Frequency')
# Title typo fixed ('Frquency' -> 'Frequency').
title('Frequency by Date of Month - Uber - April 2014')
;
''
Analysis of hours:
# Hours run 0..23, so use 24 unit-wide bins centred on the integers.
# The previous range=(0.5, 24) excluded hour 0 altogether and left the
# final bin empty.
hist(data.hour, bins=24, range=(-0.5, 23.5))
(array([ 7769., 4935., 5040., 6095., 9476., 18498., 24924., 22843.,
17939., 17865., 18774., 19425., 22603., 27190., 35324., 42003.,
45475., 43003., 38923., 36244., 36964., 30645., 20649., 0.]),
array([ 0.5 , 1.47916667, 2.45833333, 3.4375 , 4.41666667,
5.39583333, 6.375 , 7.35416667, 8.33333333, 9.3125 ,
10.29166667, 11.27083333, 12.25 , 13.22916667, 14.20833333,
15.1875 , 16.16666667, 17.14583333, 18.125 , 19.10416667,
20.08333333, 21.0625 , 22.04166667, 23.02083333, 24. ]),
<a list of 24 Patch objects>)
Analysis of weekday:
hist(data.weekday, bins = 7, range=(-0.5,6.5), rwidth= 0.8, color= 'green')
xticks(range(7), 'Mon Tue Wed Thurs Fri Sat Sun'.split())
([<matplotlib.axis.XTick at 0x1e85e996630>,
<matplotlib.axis.XTick at 0x1e85e9a8e48>,
<matplotlib.axis.XTick at 0x1e85e9a8860>,
<matplotlib.axis.XTick at 0x1e85e923160>,
<matplotlib.axis.XTick at 0x1e85e923588>,
<matplotlib.axis.XTick at 0x1e85e923a58>,
<matplotlib.axis.XTick at 0x1e85e923f28>],
<a list of 7 Text xticklabel objects>)
Analyzing hours and day of the week
count_rows(data)
564516
data.groupby('hour weekday'.split()).apply(count_rows)
hour weekday
0 0 518
1 765
2 899
3 792
4 1367
5 3027
6 4542
1 0 261
1 367
2 507
3 459
4 760
5 2479
6 2936
2 0 238
1 304
2 371
3 342
4 513
5 1577
6 1590
3 0 571
1 516
2 585
3 567
4 736
5 1013
6 1052
4 0 1021
1 887
...
19 5 5529
6 2579
20 0 3573
1 6310
2 7783
3 6345
4 5165
5 4792
6 2276
21 0 3079
1 5993
2 6921
3 6585
4 6265
5 5811
6 2310
22 0 1976
1 3614
2 4845
3 5370
4 6708
5 6493
6 1639
23 0 1091
1 1948
2 2571
3 2909
4 5393
5 5719
6 1018
Length: 168, dtype: int64
data.groupby('hour weekday'.split()).apply(count_rows).unstack()
weekday | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
---|---|---|---|---|---|---|---|
hour | |||||||
0 | 518 | 765 | 899 | 792 | 1367 | 3027 | 4542 |
1 | 261 | 367 | 507 | 459 | 760 | 2479 | 2936 |
2 | 238 | 304 | 371 | 342 | 513 | 1577 | 1590 |
3 | 571 | 516 | 585 | 567 | 736 | 1013 | 1052 |
4 | 1021 | 887 | 1003 | 861 | 932 | 706 | 685 |
5 | 1619 | 1734 | 1990 | 1454 | 1382 | 704 | 593 |
6 | 2974 | 3766 | 4230 | 3179 | 2836 | 844 | 669 |
7 | 3888 | 5304 | 5647 | 4159 | 3943 | 1110 | 873 |
8 | 3138 | 4594 | 5242 | 3616 | 3648 | 1372 | 1233 |
9 | 2211 | 2962 | 3846 | 2654 | 2732 | 1764 | 1770 |
10 | 1953 | 2900 | 3844 | 2370 | 2599 | 2086 | 2113 |
11 | 1929 | 2949 | 3889 | 2516 | 2816 | 2315 | 2360 |
12 | 1945 | 2819 | 3988 | 2657 | 2978 | 2560 | 2478 |
13 | 2294 | 3556 | 4469 | 3301 | 3535 | 2685 | 2763 |
14 | 3117 | 4489 | 5438 | 4083 | 4087 | 3042 | 2934 |
15 | 3818 | 6042 | 7071 | 5182 | 5354 | 4457 | 3400 |
16 | 4962 | 7521 | 8213 | 6149 | 6259 | 5410 | 3489 |
17 | 5574 | 8297 | 9151 | 6951 | 6790 | 5558 | 3154 |
18 | 4725 | 7089 | 8334 | 6637 | 7258 | 6165 | 2795 |
19 | 4386 | 6459 | 7794 | 5929 | 6247 | 5529 | 2579 |
20 | 3573 | 6310 | 7783 | 6345 | 5165 | 4792 | 2276 |
21 | 3079 | 5993 | 6921 | 6585 | 6265 | 5811 | 2310 |
22 | 1976 | 3614 | 4845 | 5370 | 6708 | 6493 | 1639 |
23 | 1091 | 1948 | 2571 | 2909 | 5393 | 5719 | 1018 |
by_cross1 = data.groupby('hour weekday'.split()).apply(count_rows).unstack()
seaborn.heatmap(by_cross1)
<matplotlib.axes._subplots.AxesSubplot at 0x1e85dcdb9b0>
data.groupby('weekday hour'.split()).apply(count_rows).unstack()
hour | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
weekday | |||||||||||||||||||||
0 | 518 | 261 | 238 | 571 | 1021 | 1619 | 2974 | 3888 | 3138 | 2211 | ... | 3117 | 3818 | 4962 | 5574 | 4725 | 4386 | 3573 | 3079 | 1976 | 1091 |
1 | 765 | 367 | 304 | 516 | 887 | 1734 | 3766 | 5304 | 4594 | 2962 | ... | 4489 | 6042 | 7521 | 8297 | 7089 | 6459 | 6310 | 5993 | 3614 | 1948 |
2 | 899 | 507 | 371 | 585 | 1003 | 1990 | 4230 | 5647 | 5242 | 3846 | ... | 5438 | 7071 | 8213 | 9151 | 8334 | 7794 | 7783 | 6921 | 4845 | 2571 |
3 | 792 | 459 | 342 | 567 | 861 | 1454 | 3179 | 4159 | 3616 | 2654 | ... | 4083 | 5182 | 6149 | 6951 | 6637 | 5929 | 6345 | 6585 | 5370 | 2909 |
4 | 1367 | 760 | 513 | 736 | 932 | 1382 | 2836 | 3943 | 3648 | 2732 | ... | 4087 | 5354 | 6259 | 6790 | 7258 | 6247 | 5165 | 6265 | 6708 | 5393 |
5 | 3027 | 2479 | 1577 | 1013 | 706 | 704 | 844 | 1110 | 1372 | 1764 | ... | 3042 | 4457 | 5410 | 5558 | 6165 | 5529 | 4792 | 5811 | 6493 | 5719 |
6 | 4542 | 2936 | 1590 | 1052 | 685 | 593 | 669 | 873 | 1233 | 1770 | ... | 2934 | 3400 | 3489 | 3154 | 2795 | 2579 | 2276 | 2310 | 1639 | 1018 |
7 rows × 24 columns
by_cross = data.groupby('weekday hour'.split()).apply(count_rows).unstack()
seaborn.heatmap(by_cross)
<matplotlib.axes._subplots.AxesSubplot at 0x1e85ddb2be0>
seaborn.clustermap(by_cross)
<seaborn.matrix.ClusterGrid at 0x1e85dc35710>
Analysis of lat and long:
hist(data['Lat'], bins= 100, range =(40.5,41))
;
''
hist(data['Lon'], bins = 100, range=(-74.5, -73.5))
;
''
We can combine both of them in a single plot:
hist(data['Lat'], bins= 100, range =(40.5,41), color='r')
twiny()
hist(data['Lon'], bins = 100, range=(-74.5, -73.5), color ='g')
;
''
hist(data['Lon'], bins = 100, range=(-74.5, -73.5), color ='g', alpha=0.5 )
twiny()
hist(data['Lat'], bins= 100, range =(40.5,41), color='r', alpha = 0.5)
;
''
# Longitude histogram on the primary x-axis (label fixed from
# "Longitudinal" to match the 'Latitude' label below).
hist(data['Lon'], bins=100, range=(-74.5, -73.5), color='g', alpha=0.5, label='Longitude')
grid()
legend(loc='upper right')
# twiny() creates a second x-axis sharing the same y-axis, so latitude gets
# its own horizontal scale.
twiny()
hist(data['Lat'], bins=100, range=(40.5, 41), color='r', alpha=0.5, label='Latitude')
grid()
legend(loc='upper left')
;
''
plot(data['Lat'])
[<matplotlib.lines.Line2D at 0x1e85cbdabe0>]
plot(data['Lat'])
xlim(0,100)
(0, 100)
plot(data['Lat'], '.')
xlim(0,100)
(0, 100)
plot(data['Lon'], data['Lat'])
[<matplotlib.lines.Line2D at 0x1e86149eac8>]
plot(data['Lon'], data['Lat'],'.')
[<matplotlib.lines.Line2D at 0x1e86119d550>]
plot(data['Lon'], data['Lat'],'.', ms=1)
[<matplotlib.lines.Line2D at 0x1e8614bdb38>]
plot(data['Lon'], data['Lat'],'.', ms=1, alpha= 0.5)
[<matplotlib.lines.Line2D at 0x1e85c717748>]
plot(data['Lon'], data['Lat'],'.', ms=1, alpha= 0.5)
xlim(-74.1,-73.4)
(-74.1, -73.4)
plot(data['Lon'], data['Lat'],'.', ms=1, alpha= 0.5)
xlim(-74.1,-73.4)
ylim(40.3,41.2)
(40.3, 41.2)
figure(figsize=(20,20))
plot(data['Lon'], data['Lat'],'.', ms=0.1, alpha= 0.5)
xlim(-74.2,-73.7)
ylim(40.5,41)
(40.5, 41)
figure(figsize=(20,20))
plot(data['Lon'], data['Lat'],'.', ms=1, alpha= 0.5)
xlim(-74.05,-73.80)
ylim(40.65,40.80)
(40.65, 40.8)