Analyzing Uber Data
%pylab inline
import pandas
import seaborn
Populating the interactive namespace from numpy and matplotlib
data = pandas.read_csv('Desktop/uber-raw-data-apr14.txt')
Explore your data
data.head()
| Date/Time | Lat | Lon | Base | |
|---|---|---|---|---|
| 0 | 4/1/2014 0:11:00 | 40.7690 | -73.9549 | B02512 | 
| 1 | 4/1/2014 0:17:00 | 40.7267 | -74.0345 | B02512 | 
| 2 | 4/1/2014 0:21:00 | 40.7316 | -73.9873 | B02512 | 
| 3 | 4/1/2014 0:28:00 | 40.7588 | -73.9776 | B02512 | 
| 4 | 4/1/2014 0:33:00 | 40.7594 | -73.9722 | B02512 | 
The date and time are stored as a single string; we need to convert its components to integers:
dt = '4/1/2014 0:11:00'
dt
'4/1/2014 0:11:00'
d,t = dt.split(' ')
print(d)
print(t)
4/1/2014
0:11:00
m,d,y=d.split('/')
d
'1'
int(d)
1
Another way to do it:
dt=pandas.to_datetime(dt)
dt
Timestamp('2014-04-01 00:11:00')
Here we have some nice functions to play with:
# `Timestamp.weekday_name` is deprecated in favour of the `day_name()` method,
# which returns the same weekday name as a string.
dt.day_name()
C:\Users\hamad\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: `weekday_name` is deprecated and will be removed in a future version. Use `day_name` instead
  """Entry point for launching an IPython kernel.
'Tuesday'
dt.weekday()
1
Now we convert the whole column:
# Parse the whole column with a single vectorized call; mapping
# pandas.to_datetime over the Series would invoke the parser once per row.
data['Date/Time'] = pandas.to_datetime(data['Date/Time'])
data.head()
| Date/Time | Lat | Lon | Base | |
|---|---|---|---|---|
| 0 | 2014-04-01 00:11:00 | 40.7690 | -73.9549 | B02512 | 
| 1 | 2014-04-01 00:17:00 | 40.7267 | -74.0345 | B02512 | 
| 2 | 2014-04-01 00:21:00 | 40.7316 | -73.9873 | B02512 | 
| 3 | 2014-04-01 00:28:00 | 40.7588 | -73.9776 | B02512 | 
| 4 | 2014-04-01 00:33:00 | 40.7594 | -73.9722 | B02512 | 
data['Date/Time'][0]
Timestamp('2014-04-01 00:11:00')
Let us create a function that returns the day of the month:
def getdom(dt):
    """Return the day-of-month component of ``dt``.

    ``dt`` may be any object exposing a ``day`` attribute, such as the
    ``Timestamp`` values held in the ``Date/Time`` column.
    """
    day_of_month = dt.day
    return day_of_month
data['dom'] = data['Date/Time'].map(getdom)
data.tail()
| Date/Time | Lat | Lon | Base | dom | |
|---|---|---|---|---|---|
| 564511 | 2014-04-30 23:22:00 | 40.7640 | -73.9744 | B02764 | 30 | 
| 564512 | 2014-04-30 23:26:00 | 40.7629 | -73.9672 | B02764 | 30 | 
| 564513 | 2014-04-30 23:31:00 | 40.7443 | -73.9889 | B02764 | 30 | 
| 564514 | 2014-04-30 23:32:00 | 40.6756 | -73.9405 | B02764 | 30 | 
| 564515 | 2014-04-30 23:48:00 | 40.6880 | -73.9608 | B02764 | 30 | 
data.head()
| Date/Time | Lat | Lon | Base | dom | |
|---|---|---|---|---|---|
| 0 | 2014-04-01 00:11:00 | 40.7690 | -73.9549 | B02512 | 1 | 
| 1 | 2014-04-01 00:17:00 | 40.7267 | -74.0345 | B02512 | 1 | 
| 2 | 2014-04-01 00:21:00 | 40.7316 | -73.9873 | B02512 | 1 | 
| 3 | 2014-04-01 00:28:00 | 40.7588 | -73.9776 | B02512 | 1 | 
| 4 | 2014-04-01 00:33:00 | 40.7594 | -73.9722 | B02512 | 1 | 
Create another function for the weekday
def getwkday(dt):
    """Return the weekday index of ``dt`` as given by ``dt.weekday()``.

    For date/time objects this is an integer where Monday is 0 and
    Sunday is 6 (so a Tuesday yields 1).
    """
    weekday_index = dt.weekday()
    return weekday_index
data['weekday']= data['Date/Time'].map(getwkday)
Create a function to get the hour of the day
def gethr(dt):
    """Return the hour-of-day component of ``dt``.

    ``dt`` may be any object exposing an ``hour`` attribute, such as the
    ``Timestamp`` values held in the ``Date/Time`` column.
    """
    hour_of_day = dt.hour
    return hour_of_day
data['hour']=data['Date/Time'].map(gethr)
data.head()
| Date/Time | Lat | Lon | Base | dom | weekday | hour | |
|---|---|---|---|---|---|---|---|
| 0 | 2014-04-01 00:11:00 | 40.7690 | -73.9549 | B02512 | 1 | 1 | 0 | 
| 1 | 2014-04-01 00:17:00 | 40.7267 | -74.0345 | B02512 | 1 | 1 | 0 | 
| 2 | 2014-04-01 00:21:00 | 40.7316 | -73.9873 | B02512 | 1 | 1 | 0 | 
| 3 | 2014-04-01 00:28:00 | 40.7588 | -73.9776 | B02512 | 1 | 1 | 0 | 
| 4 | 2014-04-01 00:33:00 | 40.7594 | -73.9722 | B02512 | 1 | 1 | 0 | 
data.tail()
| Date/Time | Lat | Lon | Base | dom | weekday | hour | |
|---|---|---|---|---|---|---|---|
| 564511 | 2014-04-30 23:22:00 | 40.7640 | -73.9744 | B02764 | 30 | 2 | 23 | 
| 564512 | 2014-04-30 23:26:00 | 40.7629 | -73.9672 | B02764 | 30 | 2 | 23 | 
| 564513 | 2014-04-30 23:31:00 | 40.7443 | -73.9889 | B02764 | 30 | 2 | 23 | 
| 564514 | 2014-04-30 23:32:00 | 40.6756 | -73.9405 | B02764 | 30 | 2 | 23 | 
| 564515 | 2014-04-30 23:48:00 | 40.6880 | -73.9608 | B02764 | 30 | 2 | 23 | 
Analysis:
Analyze the date of the month:
hist(data.dom)
(array([52721., 59680., 52581., 58631., 45427., 56764., 38781., 60673.,
        64697., 74561.]),
 array([ 1. ,  3.9,  6.8,  9.7, 12.6, 15.5, 18.4, 21.3, 24.2, 27.1, 30. ]),
 <a list of 10 Patch objects>)

hist(data.dom, bins = 30 )
(array([14546., 17474., 20701., 26714., 19521., 13445., 19550., 16188.,
        16843., 20041., 20420., 18170., 12112., 12674., 20641., 17717.,
        20973., 18074., 14602., 11017., 13162., 16975., 20346., 23352.,
        25095., 24925., 14677., 15475., 22835., 36251.]),
 array([ 1.        ,  1.96666667,  2.93333333,  3.9       ,  4.86666667,
         5.83333333,  6.8       ,  7.76666667,  8.73333333,  9.7       ,
        10.66666667, 11.63333333, 12.6       , 13.56666667, 14.53333333,
        15.5       , 16.46666667, 17.43333333, 18.4       , 19.36666667,
        20.33333333, 21.3       , 22.26666667, 23.23333333, 24.2       ,
        25.16666667, 26.13333333, 27.1       , 28.06666667, 29.03333333,
        30.        ]),
 <a list of 30 Patch objects>)

hist(data.dom, bins = 30, rwidth=0.8)
(array([14546., 17474., 20701., 26714., 19521., 13445., 19550., 16188.,
        16843., 20041., 20420., 18170., 12112., 12674., 20641., 17717.,
        20973., 18074., 14602., 11017., 13162., 16975., 20346., 23352.,
        25095., 24925., 14677., 15475., 22835., 36251.]),
 array([ 1.        ,  1.96666667,  2.93333333,  3.9       ,  4.86666667,
         5.83333333,  6.8       ,  7.76666667,  8.73333333,  9.7       ,
        10.66666667, 11.63333333, 12.6       , 13.56666667, 14.53333333,
        15.5       , 16.46666667, 17.43333333, 18.4       , 19.36666667,
        20.33333333, 21.3       , 22.26666667, 23.23333333, 24.2       ,
        25.16666667, 26.13333333, 27.1       , 28.06666667, 29.03333333,
        30.        ]),
 <a list of 30 Patch objects>)

# One bin per calendar day: 30 unit-wide bins over (0.5, 30.5) centre each
# bar on an integer day of the month.
hist(data.dom, bins=30, rwidth=0.8, range=(0.5, 30.5))
xlabel('Date of the Month')
ylabel('Frequency')
title('Frequency by Date of Month - Uber - April 2014')
Text(0.5,1,'Frquency by Date of Month - Uber - April 2014')

for i, rows in data.groupby('dom'):
    print((i,len(rows)))
(1, 14546)
(2, 17474)
(3, 20701)
(4, 26714)
(5, 19521)
(6, 13445)
(7, 19550)
(8, 16188)
(9, 16843)
(10, 20041)
(11, 20420)
(12, 18170)
(13, 12112)
(14, 12674)
(15, 20641)
(16, 17717)
(17, 20973)
(18, 18074)
(19, 14602)
(20, 11017)
(21, 13162)
(22, 16975)
(23, 20346)
(24, 23352)
(25, 25095)
(26, 24925)
(27, 14677)
(28, 15475)
(29, 22835)
(30, 36251)
Another way of doing it
def count_rows(rows):
    """Return the number of items in ``rows`` as reported by ``len``.

    Used as a group-wise aggregator, so each group is reduced to its
    row count.
    """
    n_rows = len(rows)
    return n_rows
by_date = data.groupby('dom').apply(count_rows)
by_date
dom
1     14546
2     17474
3     20701
4     26714
5     19521
6     13445
7     19550
8     16188
9     16843
10    20041
11    20420
12    18170
13    12112
14    12674
15    20641
16    17717
17    20973
18    18074
19    14602
20    11017
21    13162
22    16975
23    20346
24    23352
25    25095
26    24925
27    14677
28    15475
29    22835
30    36251
dtype: int64
plot(by_date)
[<matplotlib.lines.Line2D at 0x1e85e54eac8>]

Let's sort the days of the month by their number of trips:
by_date_sorted = by_date.sort_values()
by_date_sorted
dom
20    11017
13    12112
14    12674
21    13162
6     13445
1     14546
19    14602
27    14677
28    15475
8     16188
9     16843
22    16975
2     17474
16    17717
18    18074
12    18170
5     19521
7     19550
10    20041
23    20346
11    20420
15    20641
3     20701
17    20973
29    22835
24    23352
26    24925
25    25095
4     26714
30    36251
dtype: int64
# Bars are drawn in ascending order of trip count; the tick labels carry the
# actual day of the month taken from the sorted index. Bar positions are
# derived from the series length so the plot still works for other months.
positions = range(1, len(by_date_sorted) + 1)
bar(positions, by_date_sorted)
xticks(positions, by_date_sorted.index)
xlabel('Date of the Month')
ylabel('Frequency')
title('Frequency by Date of Month - Uber - April 2014')
;
''

Analysis of hours:
# Hours run 0-23, so use 24 unit-wide bins centred on each integer hour.
# A range of (0.5, 24) would drop every hour-0 trip, shift the remaining
# hours into misaligned bins, and leave the last bin empty.
hist(data.hour, bins=24, range=(-0.5, 23.5))
(array([ 7769.,  4935.,  5040.,  6095.,  9476., 18498., 24924., 22843.,
        17939., 17865., 18774., 19425., 22603., 27190., 35324., 42003.,
        45475., 43003., 38923., 36244., 36964., 30645., 20649.,     0.]),
 array([ 0.5       ,  1.47916667,  2.45833333,  3.4375    ,  4.41666667,
         5.39583333,  6.375     ,  7.35416667,  8.33333333,  9.3125    ,
        10.29166667, 11.27083333, 12.25      , 13.22916667, 14.20833333,
        15.1875    , 16.16666667, 17.14583333, 18.125     , 19.10416667,
        20.08333333, 21.0625    , 22.04166667, 23.02083333, 24.        ]),
 <a list of 24 Patch objects>)

Analysis of weekday:
hist(data.weekday, bins = 7, range=(-0.5,6.5), rwidth= 0.8, color= 'green')
xticks(range(7), 'Mon Tue Wed Thurs Fri Sat Sun'.split())
([<matplotlib.axis.XTick at 0x1e85e996630>,
  <matplotlib.axis.XTick at 0x1e85e9a8e48>,
  <matplotlib.axis.XTick at 0x1e85e9a8860>,
  <matplotlib.axis.XTick at 0x1e85e923160>,
  <matplotlib.axis.XTick at 0x1e85e923588>,
  <matplotlib.axis.XTick at 0x1e85e923a58>,
  <matplotlib.axis.XTick at 0x1e85e923f28>],
 <a list of 7 Text xticklabel objects>)

Analyzing hours and day of the week
count_rows(data)
564516
data.groupby('hour weekday'.split()).apply(count_rows)
hour  weekday
0     0           518
      1           765
      2           899
      3           792
      4          1367
      5          3027
      6          4542
1     0           261
      1           367
      2           507
      3           459
      4           760
      5          2479
      6          2936
2     0           238
      1           304
      2           371
      3           342
      4           513
      5          1577
      6          1590
3     0           571
      1           516
      2           585
      3           567
      4           736
      5          1013
      6          1052
4     0          1021
      1           887
                 ...
19    5          5529
      6          2579
20    0          3573
      1          6310
      2          7783
      3          6345
      4          5165
      5          4792
      6          2276
21    0          3079
      1          5993
      2          6921
      3          6585
      4          6265
      5          5811
      6          2310
22    0          1976
      1          3614
      2          4845
      3          5370
      4          6708
      5          6493
      6          1639
23    0          1091
      1          1948
      2          2571
      3          2909
      4          5393
      5          5719
      6          1018
Length: 168, dtype: int64
data.groupby('hour weekday'.split()).apply(count_rows).unstack()
| weekday | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 
|---|---|---|---|---|---|---|---|
| hour | |||||||
| 0 | 518 | 765 | 899 | 792 | 1367 | 3027 | 4542 | 
| 1 | 261 | 367 | 507 | 459 | 760 | 2479 | 2936 | 
| 2 | 238 | 304 | 371 | 342 | 513 | 1577 | 1590 | 
| 3 | 571 | 516 | 585 | 567 | 736 | 1013 | 1052 | 
| 4 | 1021 | 887 | 1003 | 861 | 932 | 706 | 685 | 
| 5 | 1619 | 1734 | 1990 | 1454 | 1382 | 704 | 593 | 
| 6 | 2974 | 3766 | 4230 | 3179 | 2836 | 844 | 669 | 
| 7 | 3888 | 5304 | 5647 | 4159 | 3943 | 1110 | 873 | 
| 8 | 3138 | 4594 | 5242 | 3616 | 3648 | 1372 | 1233 | 
| 9 | 2211 | 2962 | 3846 | 2654 | 2732 | 1764 | 1770 | 
| 10 | 1953 | 2900 | 3844 | 2370 | 2599 | 2086 | 2113 | 
| 11 | 1929 | 2949 | 3889 | 2516 | 2816 | 2315 | 2360 | 
| 12 | 1945 | 2819 | 3988 | 2657 | 2978 | 2560 | 2478 | 
| 13 | 2294 | 3556 | 4469 | 3301 | 3535 | 2685 | 2763 | 
| 14 | 3117 | 4489 | 5438 | 4083 | 4087 | 3042 | 2934 | 
| 15 | 3818 | 6042 | 7071 | 5182 | 5354 | 4457 | 3400 | 
| 16 | 4962 | 7521 | 8213 | 6149 | 6259 | 5410 | 3489 | 
| 17 | 5574 | 8297 | 9151 | 6951 | 6790 | 5558 | 3154 | 
| 18 | 4725 | 7089 | 8334 | 6637 | 7258 | 6165 | 2795 | 
| 19 | 4386 | 6459 | 7794 | 5929 | 6247 | 5529 | 2579 | 
| 20 | 3573 | 6310 | 7783 | 6345 | 5165 | 4792 | 2276 | 
| 21 | 3079 | 5993 | 6921 | 6585 | 6265 | 5811 | 2310 | 
| 22 | 1976 | 3614 | 4845 | 5370 | 6708 | 6493 | 1639 | 
| 23 | 1091 | 1948 | 2571 | 2909 | 5393 | 5719 | 1018 | 
by_cross1 = data.groupby('hour weekday'.split()).apply(count_rows).unstack()
seaborn.heatmap(by_cross1)
<matplotlib.axes._subplots.AxesSubplot at 0x1e85dcdb9b0>

data.groupby('weekday hour'.split()).apply(count_rows).unstack()
| hour | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| weekday | |||||||||||||||||||||
| 0 | 518 | 261 | 238 | 571 | 1021 | 1619 | 2974 | 3888 | 3138 | 2211 | ... | 3117 | 3818 | 4962 | 5574 | 4725 | 4386 | 3573 | 3079 | 1976 | 1091 | 
| 1 | 765 | 367 | 304 | 516 | 887 | 1734 | 3766 | 5304 | 4594 | 2962 | ... | 4489 | 6042 | 7521 | 8297 | 7089 | 6459 | 6310 | 5993 | 3614 | 1948 | 
| 2 | 899 | 507 | 371 | 585 | 1003 | 1990 | 4230 | 5647 | 5242 | 3846 | ... | 5438 | 7071 | 8213 | 9151 | 8334 | 7794 | 7783 | 6921 | 4845 | 2571 | 
| 3 | 792 | 459 | 342 | 567 | 861 | 1454 | 3179 | 4159 | 3616 | 2654 | ... | 4083 | 5182 | 6149 | 6951 | 6637 | 5929 | 6345 | 6585 | 5370 | 2909 | 
| 4 | 1367 | 760 | 513 | 736 | 932 | 1382 | 2836 | 3943 | 3648 | 2732 | ... | 4087 | 5354 | 6259 | 6790 | 7258 | 6247 | 5165 | 6265 | 6708 | 5393 | 
| 5 | 3027 | 2479 | 1577 | 1013 | 706 | 704 | 844 | 1110 | 1372 | 1764 | ... | 3042 | 4457 | 5410 | 5558 | 6165 | 5529 | 4792 | 5811 | 6493 | 5719 | 
| 6 | 4542 | 2936 | 1590 | 1052 | 685 | 593 | 669 | 873 | 1233 | 1770 | ... | 2934 | 3400 | 3489 | 3154 | 2795 | 2579 | 2276 | 2310 | 1639 | 1018 | 
7 rows × 24 columns
by_cross = data.groupby('weekday hour'.split()).apply(count_rows).unstack()
seaborn.heatmap(by_cross)
<matplotlib.axes._subplots.AxesSubplot at 0x1e85ddb2be0>

seaborn.clustermap(by_cross)
<seaborn.matrix.ClusterGrid at 0x1e85dc35710>

Analysis of lat and long:
hist(data['Lat'], bins= 100, range =(40.5,41))
;
''

hist(data['Lon'], bins = 100, range=(-74.5, -73.5))
;
''

we can combine both of them
hist(data['Lat'], bins= 100, range =(40.5,41), color='r')
twiny()
hist(data['Lon'], bins = 100, range=(-74.5, -73.5), color ='g')
;
''

hist(data['Lon'], bins = 100, range=(-74.5, -73.5), color ='g', alpha=0.5 )
twiny()
hist(data['Lat'], bins= 100, range =(40.5,41), color='r', alpha = 0.5)
;
''

hist(data['Lon'], bins = 100, range=(-74.5, -73.5), color ='g', alpha=0.5, label="Longitudinal" )
grid()
legend(loc= 'upper right')
twiny()
hist(data['Lat'], bins= 100, range =(40.5,41), color='r', alpha = 0.5, label='Latitude')
grid()
legend(loc = 'upper left')
;
''

plot(data['Lat'])
[<matplotlib.lines.Line2D at 0x1e85cbdabe0>]

plot(data['Lat'])
xlim(0,100)
(0, 100)

plot(data['Lat'], '.')
xlim(0,100)
(0, 100)

plot(data['Lon'], data['Lat'])
[<matplotlib.lines.Line2D at 0x1e86149eac8>]

plot(data['Lon'], data['Lat'],'.')
[<matplotlib.lines.Line2D at 0x1e86119d550>]

plot(data['Lon'], data['Lat'],'.', ms=1)
[<matplotlib.lines.Line2D at 0x1e8614bdb38>]

plot(data['Lon'], data['Lat'],'.', ms=1, alpha= 0.5)
[<matplotlib.lines.Line2D at 0x1e85c717748>]

plot(data['Lon'], data['Lat'],'.', ms=1, alpha= 0.5)
xlim(-74.1,-73.4)
(-74.1, -73.4)

plot(data['Lon'], data['Lat'],'.', ms=1, alpha= 0.5)
xlim(-74.1,-73.4)
ylim(40.3,41.2)
(40.3, 41.2)

figure(figsize=(20,20))
plot(data['Lon'], data['Lat'],'.', ms=0.1, alpha= 0.5)
xlim(-74.2,-73.7)
ylim(40.5,41)
(40.5, 41)

figure(figsize=(20,20))
plot(data['Lon'], data['Lat'],'.', ms=1, alpha= 0.5)
xlim(-74.05,-73.80)
ylim(40.65,40.80)
(40.65, 40.8)

 
  
  
