Analyzing Uber Data
%pylab inline
import pandas
import seaborn
# Load the raw Uber data file into a DataFrame; the path is relative, so it
# is resolved against the current working directory.
data = pandas.read_csv('Desktop/uber-raw-data-apr14.txt')
Explore your data
Date/Time | Lat | Lon | Base | |
0 | 4/1/2014 0:11:00 | 40.7690 | -73.9549 | B02512 |
1 | 4/1/2014 0:17:00 | 40.7267 | -74.0345 | B02512 |
2 | 4/1/2014 0:21:00 | 40.7316 | -73.9873 | B02512 |
3 | 4/1/2014 0:28:00 | 40.7588 | -73.9776 | B02512 |
4 | 4/1/2014 0:33:00 | 40.7594 | -73.9722 | B02512 |
The date and time are stored as strings; we need to convert them to integers:
dt = '4/1/2014 0:11:00'
'4/1/2014 0:11:00'
d,t = dt.split(' ')
Another way to do it:
Timestamp('2014-04-01 00:11:00')
here we have nice functions to play with
Now we convert the whole column:
# Parse the whole column in a single vectorized call instead of mapping
# pandas.to_datetime over every row individually, which is far slower on a
# column of this size. The explicit format matches the raw
# 'month/day/year hour:minute:second' strings such as '4/1/2014 0:11:00'.
data['Date/Time'] = pandas.to_datetime(data['Date/Time'], format='%m/%d/%Y %H:%M:%S')
Date/Time | Lat | Lon | Base | |
0 | 2014-04-01 00:11:00 | 40.7690 | -73.9549 | B02512 |
1 | 2014-04-01 00:17:00 | 40.7267 | -74.0345 | B02512 |
2 | 2014-04-01 00:21:00 | 40.7316 | -73.9873 | B02512 |
3 | 2014-04-01 00:28:00 | 40.7588 | -73.9776 | B02512 |
4 | 2014-04-01 00:33:00 | 40.7594 | -73.9722 | B02512 |
Timestamp('2014-04-01 00:11:00')
Let us create a function that returns the day of the month:
def getdom(dt):
    """Return the day of the month of ``dt``.

    Args:
        dt: A date-like object exposing a ``day`` attribute, such as the
            ``Timestamp`` values stored in the ``Date/Time`` column.

    Returns:
        The value of ``dt.day``.
    """
    return dt.day
data['dom'] = data['Date/Time'].map(getdom)
Date/Time | Lat | Lon | Base | dom | |
564511 | 2014-04-30 23:22:00 | 40.7640 | -73.9744 | B02764 | 30 |
564512 | 2014-04-30 23:26:00 | 40.7629 | -73.9672 | B02764 | 30 |
564513 | 2014-04-30 23:31:00 | 40.7443 | -73.9889 | B02764 | 30 |
564514 | 2014-04-30 23:32:00 | 40.6756 | -73.9405 | B02764 | 30 |
564515 | 2014-04-30 23:48:00 | 40.6880 | -73.9608 | B02764 | 30 |
Date/Time | Lat | Lon | Base | dom | |
0 | 2014-04-01 00:11:00 | 40.7690 | -73.9549 | B02512 | 1 |
1 | 2014-04-01 00:17:00 | 40.7267 | -74.0345 | B02512 | 1 |
2 | 2014-04-01 00:21:00 | 40.7316 | -73.9873 | B02512 | 1 |
3 | 2014-04-01 00:28:00 | 40.7588 | -73.9776 | B02512 | 1 |
4 | 2014-04-01 00:33:00 | 40.7594 | -73.9722 | B02512 | 1 |
Create another function for the weekday
def getwkday(dt):
    """Return the day of the week of ``dt`` as an integer.

    Args:
        dt: A date-like object providing a ``weekday()`` method, such as the
            ``Timestamp`` values stored in the ``Date/Time`` column.

    Returns:
        The result of ``dt.weekday()``; for ``datetime`` objects Monday is 0
        and Sunday is 6.
    """
    return dt.weekday()
data['weekday']= data['Date/Time'].map(getwkday)
Create a function to get the hour of the day
def gethr(dt):
    """Return the hour of the day of ``dt``.

    Args:
        dt: A datetime-like object exposing an ``hour`` attribute, such as
            the ``Timestamp`` values stored in the ``Date/Time`` column.

    Returns:
        The value of ``dt.hour``.
    """
    return dt.hour
Date/Time | Lat | Lon | Base | dom | weekday | hour | |
0 | 2014-04-01 00:11:00 | 40.7690 | -73.9549 | B02512 | 1 | 1 | 0 |
1 | 2014-04-01 00:17:00 | 40.7267 | -74.0345 | B02512 | 1 | 1 | 0 |
2 | 2014-04-01 00:21:00 | 40.7316 | -73.9873 | B02512 | 1 | 1 | 0 |
3 | 2014-04-01 00:28:00 | 40.7588 | -73.9776 | B02512 | 1 | 1 | 0 |
4 | 2014-04-01 00:33:00 | 40.7594 | -73.9722 | B02512 | 1 | 1 | 0 |
Date/Time | Lat | Lon | Base | dom | weekday | hour | |
564511 | 2014-04-30 23:22:00 | 40.7640 | -73.9744 | B02764 | 30 | 2 | 23 |
564512 | 2014-04-30 23:26:00 | 40.7629 | -73.9672 | B02764 | 30 | 2 | 23 |
564513 | 2014-04-30 23:31:00 | 40.7443 | -73.9889 | B02764 | 30 | 2 | 23 |
564514 | 2014-04-30 23:32:00 | 40.6756 | -73.9405 | B02764 | 30 | 2 | 23 |
564515 | 2014-04-30 23:48:00 | 40.6880 | -73.9608 | B02764 | 30 | 2 | 23 |
Analyze the date of the month:
(array([52721., 59680., 52581., 58631., 45427., 56764., 38781., 60673.,
64697., 74561.]),
array([ 1. , 3.9, 6.8, 9.7, 12.6, 15.5, 18.4, 21.3, 24.2, 27.1, 30. ]),
hist(data.dom, bins = 30 )
(array([14546., 17474., 20701., 26714., 19521., 13445., 19550., 16188.,
16843., 20041., 20420., 18170., 12112., 12674., 20641., 17717.,
20973., 18074., 14602., 11017., 13162., 16975., 20346., 23352.,
25095., 24925., 14677., 15475., 22835., 36251.]),
array([ 1. , 1.96666667, 2.93333333, 3.9 , 4.86666667,
5.83333333, 6.8 , 7.76666667, 8.73333333, 9.7 ,
10.66666667, 11.63333333, 12.6 , 13.56666667, 14.53333333,
15.5 , 16.46666667, 17.43333333, 18.4 , 19.36666667,
20.33333333, 21.3 , 22.26666667, 23.23333333, 24.2 ,
25.16666667, 26.13333333, 27.1 , 28.06666667, 29.03333333,
30. ]),
hist(data.dom, bins = 30, rwidth=0.8)
(array([14546., 17474., 20701., 26714., 19521., 13445., 19550., 16188.,
16843., 20041., 20420., 18170., 12112., 12674., 20641., 17717.,
20973., 18074., 14602., 11017., 13162., 16975., 20346., 23352.,
25095., 24925., 14677., 15475., 22835., 36251.]),
array([ 1. , 1.96666667, 2.93333333, 3.9 , 4.86666667,
5.83333333, 6.8 , 7.76666667, 8.73333333, 9.7 ,
10.66666667, 11.63333333, 12.6 , 13.56666667, 14.53333333,
15.5 , 16.46666667, 17.43333333, 18.4 , 19.36666667,
20.33333333, 21.3 , 22.26666667, 23.23333333, 24.2 ,
25.16666667, 26.13333333, 27.1 , 28.06666667, 29.03333333,
30. ]),
# One bin per calendar day: 30 unit-wide bins centred on the integers 1..30.
hist(data.dom, bins= 30, rwidth=0.8, range=(0.5, 30.5))
xlabel('Date of the Month')
ylabel('Frequency')
title('Frequency by Date of Month - Uber - April 2014')
# Print a (day of month, number of trips) pair for every day in the data.
for i, rows in data.groupby('dom'):
    print((i, len(rows)))
(1, 14546)
(2, 17474)
(3, 20701)
(4, 26714)
(5, 19521)
(6, 13445)
(7, 19550)
(8, 16188)
(9, 16843)
(10, 20041)
(11, 20420)
(12, 18170)
(13, 12112)
(14, 12674)
(15, 20641)
(16, 17717)
(17, 20973)
(18, 18074)
(19, 14602)
(20, 11017)
(21, 13162)
(22, 16975)
(23, 20346)
(24, 23352)
(25, 25095)
(26, 24925)
(27, 14677)
(28, 15475)
(29, 22835)
(30, 36251)
Another way of doing it
def count_rows(rows):
    """Return the number of rows in ``rows``.

    Intended as the aggregation passed to ``groupby(...).apply`` so that
    each group is reduced to its size.

    Args:
        rows: Any sized collection, e.g. the sub-frame of one group.

    Returns:
        ``len(rows)``.
    """
    return len(rows)
by_date = data.groupby('dom').apply(count_rows)
1 14546
2 17474
3 20701
4 26714
5 19521
6 13445
7 19550
8 16188
9 16843
10 20041
11 20420
12 18170
13 12112
14 12674
15 20641
16 17717
17 20973
18 18074
19 14602
20 11017
21 13162
22 16975
23 20346
24 23352
25 25095
26 24925
27 14677
28 15475
29 22835
30 36251
Let's sort the days of the month by their frequency of trips:
by_date_sorted = by_date.sort_values()
20 11017
13 12112
14 12674
21 13162
6 13445
1 14546
19 14602
27 14677
28 15475
8 16188
9 16843
22 16975
2 17474
16 17717
18 18074
12 18170
5 19521
7 19550
10 20041
23 20346
11 20420
15 20641
3 20701
17 20973
29 22835
24 23352
26 24925
25 25095
4 26714
30 36251
# The bars are drawn in ascending order of trip count, so label each bar
# with the day of the month it belongs to (the sorted index) rather than
# with its rank 1..30, which would attribute counts to the wrong days.
bar(range(1,31), by_date_sorted)
xticks(range(1, 31), by_date_sorted.index)
xlabel('Date of the Month')
ylabel('Frequency')
title('Frequency by Date of Month - Uber - April 2014')
Analysis of hours:
# Hours run from 0 to 23, so use 24 unit-wide bins centred on each integer.
# The previous range=(0.5, 24) silently dropped every trip in hour 0 and left
# the last bin empty (see the trailing 0. in the recorded output below, which
# predates this fix); re-run the cell to refresh the counts.
hist(data.hour, bins=24, range=(-0.5, 23.5))
(array([ 7769., 4935., 5040., 6095., 9476., 18498., 24924., 22843.,
17939., 17865., 18774., 19425., 22603., 27190., 35324., 42003.,
45475., 43003., 38923., 36244., 36964., 30645., 20649., 0.]),
array([ 0.5 , 1.47916667, 2.45833333, 3.4375 , 4.41666667,
5.39583333, 6.375 , 7.35416667, 8.33333333, 9.3125 ,
10.29166667, 11.27083333, 12.25 , 13.22916667, 14.20833333,
15.1875 , 16.16666667, 17.14583333, 18.125 , 19.10416667,
20.08333333, 21.0625 , 22.04166667, 23.02083333, 24. ]),
<a list of 24 Patch objects>)
Analysis of weekday:
hist(data.weekday, bins = 7, range=(-0.5,6.5), rwidth= 0.8, color= 'green')
xticks(range(7), 'Mon Tue Wed Thurs Fri Sat Sun'.split())
Analyzing hours and day of the week
data.groupby('hour weekday'.split()).apply(count_rows)
hour weekday
0 0 518
1 765
2 899
3 792
4 1367
5 3027
6 4542
1 0 261
1 367
2 507
3 459
4 760
5 2479
6 2936
2 0 238
1 304
2 371
3 342
4 513
5 1577
6 1590
3 0 571
1 516
2 585
3 567
4 736
5 1013
6 1052
4 0 1021
1 887
19 5 5529
6 2579
20 0 3573
1 6310
2 7783
3 6345
4 5165
5 4792
6 2276
21 0 3079
1 5993
2 6921
3 6585
4 6265
5 5811
6 2310
22 0 1976
1 3614
2 4845
3 5370
4 6708
5 6493
6 1639
23 0 1091
1 1948
2 2571
3 2909
4 5393
5 5719
6 1018
data.groupby('hour weekday'.split()).apply(count_rows).unstack()
weekday | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
hour | |||||||
0 | 518 | 765 | 899 | 792 | 1367 | 3027 | 4542 |
1 | 261 | 367 | 507 | 459 | 760 | 2479 | 2936 |
2 | 238 | 304 | 371 | 342 | 513 | 1577 | 1590 |
3 | 571 | 516 | 585 | 567 | 736 | 1013 | 1052 |
4 | 1021 | 887 | 1003 | 861 | 932 | 706 | 685 |
5 | 1619 | 1734 | 1990 | 1454 | 1382 | 704 | 593 |
6 | 2974 | 3766 | 4230 | 3179 | 2836 | 844 | 669 |
7 | 3888 | 5304 | 5647 | 4159 | 3943 | 1110 | 873 |
8 | 3138 | 4594 | 5242 | 3616 | 3648 | 1372 | 1233 |
9 | 2211 | 2962 | 3846 | 2654 | 2732 | 1764 | 1770 |
10 | 1953 | 2900 | 3844 | 2370 | 2599 | 2086 | 2113 |
11 | 1929 | 2949 | 3889 | 2516 | 2816 | 2315 | 2360 |
12 | 1945 | 2819 | 3988 | 2657 | 2978 | 2560 | 2478 |
13 | 2294 | 3556 | 4469 | 3301 | 3535 | 2685 | 2763 |
14 | 3117 | 4489 | 5438 | 4083 | 4087 | 3042 | 2934 |
15 | 3818 | 6042 | 7071 | 5182 | 5354 | 4457 | 3400 |
16 | 4962 | 7521 | 8213 | 6149 | 6259 | 5410 | 3489 |
17 | 5574 | 8297 | 9151 | 6951 | 6790 | 5558 | 3154 |
18 | 4725 | 7089 | 8334 | 6637 | 7258 | 6165 | 2795 |
19 | 4386 | 6459 | 7794 | 5929 | 6247 | 5529 | 2579 |
20 | 3573 | 6310 | 7783 | 6345 | 5165 | 4792 | 2276 |
21 | 3079 | 5993 | 6921 | 6585 | 6265 | 5811 | 2310 |
22 | 1976 | 3614 | 4845 | 5370 | 6708 | 6493 | 1639 |
23 | 1091 | 1948 | 2571 | 2909 | 5393 | 5719 | 1018 |
by_cross1 = data.groupby('hour weekday'.split()).apply(count_rows).unstack()
data.groupby('weekday hour'.split()).apply(count_rows).unstack()
hour | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
weekday | |||||||||||||||||||||
0 | 518 | 261 | 238 | 571 | 1021 | 1619 | 2974 | 3888 | 3138 | 2211 | ... | 3117 | 3818 | 4962 | 5574 | 4725 | 4386 | 3573 | 3079 | 1976 | 1091 |
1 | 765 | 367 | 304 | 516 | 887 | 1734 | 3766 | 5304 | 4594 | 2962 | ... | 4489 | 6042 | 7521 | 8297 | 7089 | 6459 | 6310 | 5993 | 3614 | 1948 |
2 | 899 | 507 | 371 | 585 | 1003 | 1990 | 4230 | 5647 | 5242 | 3846 | ... | 5438 | 7071 | 8213 | 9151 | 8334 | 7794 | 7783 | 6921 | 4845 | 2571 |
3 | 792 | 459 | 342 | 567 | 861 | 1454 | 3179 | 4159 | 3616 | 2654 | ... | 4083 | 5182 | 6149 | 6951 | 6637 | 5929 | 6345 | 6585 | 5370 | 2909 |
4 | 1367 | 760 | 513 | 736 | 932 | 1382 | 2836 | 3943 | 3648 | 2732 | ... | 4087 | 5354 | 6259 | 6790 | 7258 | 6247 | 5165 | 6265 | 6708 | 5393 |
5 | 3027 | 2479 | 1577 | 1013 | 706 | 704 | 844 | 1110 | 1372 | 1764 | ... | 3042 | 4457 | 5410 | 5558 | 6165 | 5529 | 4792 | 5811 | 6493 | 5719 |
6 | 4542 | 2936 | 1590 | 1052 | 685 | 593 | 669 | 873 | 1233 | 1770 | ... | 2934 | 3400 | 3489 | 3154 | 2795 | 2579 | 2276 | 2310 | 1639 | 1018 |
by_cross = data.groupby('weekday hour'.split()).apply(count_rows).unstack()
Analysis of lat and long:
hist(data['Lat'], bins= 100, range =(40.5,41))
hist(data['Lon'], bins = 100, range=(-74.5, -73.5))
we can combine both of them
hist(data['Lat'], bins= 100, range =(40.5,41), color='r')
hist(data['Lon'], bins = 100, range=(-74.5, -73.5), color ='g')
hist(data['Lon'], bins = 100, range=(-74.5, -73.5), color ='g', alpha=0.5 )
hist(data['Lat'], bins= 100, range =(40.5,41), color='r', alpha = 0.5)
hist(data['Lon'], bins = 100, range=(-74.5, -73.5), color ='g', alpha=0.5, label="Longitudinal" )
legend(loc= 'upper right')
hist(data['Lat'], bins= 100, range =(40.5,41), color='r', alpha = 0.5, label='Latitude')
legend(loc = 'upper left')
plot(data['Lat'], '.')
plot(data['Lon'], data['Lat'])
plot(data['Lon'], data['Lat'],'.')
plot(data['Lon'], data['Lat'],'.', ms=1)
plot(data['Lon'], data['Lat'],'.', ms=1, alpha= 0.5)
plot(data['Lon'], data['Lat'],'.', ms=1, alpha= 0.5)
plot(data['Lon'], data['Lat'],'.', ms=1, alpha= 0.5)
plot(data['Lon'], data['Lat'],'.', ms=0.1, alpha= 0.5)
plot(data['Lon'], data['Lat'],'.', ms=1, alpha= 0.5)
(40.65, 40.8)