Using SQL in Python, Citi Bike Analysis, as a Case Study

43 minute read

import pandas as pd
import sqlite3 as sql

read csv from url

pd.read_csv('https://s3.amazonaws.com/tripdata/JC-201705-citibike-tripdata.csv.zip',nrows=10)
tripduration starttime stoptime start station id start station name start station latitude start station longitude end station id end station name end station latitude end station longitude bikeid usertype birth year gender
0 347 2017-05-01 00:03:19 2017-05-01 00:09:06 3276 Marin Light Rail 40.714584 -74.042817 3214 Essex Light Rail 40.712774 -74.036486 26177 Subscriber 1958 1
1 108 2017-05-01 00:16:04 2017-05-01 00:17:53 3275 Columbus Drive 40.718355 -74.038914 3187 Warren St 40.721124 -74.038051 26266 Subscriber 1963 1
2 165 2017-05-01 00:29:59 2017-05-01 00:32:44 3267 Morris Canal 40.712419 -74.038526 3275 Columbus Drive 40.718355 -74.038914 26168 Subscriber 1988 1
3 289 2017-05-01 00:30:47 2017-05-01 00:35:36 3209 Brunswick St 40.724176 -74.050656 3205 JC Medical Center 40.716540 -74.049638 26273 Subscriber 1986 2
4 115 2017-05-01 00:41:14 2017-05-01 00:43:09 3185 City Hall 40.717733 -74.043845 3213 Van Vorst Park 40.718489 -74.047727 26293 Subscriber 1983 1
5 285 2017-05-01 00:47:59 2017-05-01 00:52:44 3205 JC Medical Center 40.716540 -74.049638 3209 Brunswick St 40.724176 -74.050656 26273 Subscriber 1986 2
6 163 2017-05-01 04:46:48 2017-05-01 04:49:32 3267 Morris Canal 40.712419 -74.038526 3183 Exchange Place 40.716247 -74.033459 29645 Subscriber 1980 1
7 876 2017-05-01 05:02:30 2017-05-01 05:17:06 3207 Oakland Ave 40.737604 -74.052478 3185 City Hall 40.717733 -74.043845 29272 Subscriber 1984 2
8 394 2017-05-01 05:17:09 2017-05-01 05:23:43 3202 Newport PATH 40.727224 -74.033759 3272 Jersey & 3rd 40.723332 -74.045953 26211 Subscriber 1976 1
9 164 2017-05-01 05:31:28 2017-05-01 05:34:12 3214 Essex Light Rail 40.712774 -74.036486 3183 Exchange Place 40.716247 -74.033459 19318 Subscriber 1979 1

read csv into sqlite

db = sql.connect('citibike.db')
pd.DataFrame.to_sql?
url = 'https://s3.amazonaws.com/tripdata/20170%s-citibike-tripdata.csv.zip'
for i in [5,6,7,8,9]:
    print(url % i)
    chunks = pd.read_csv(url %i, chunksize=100_000)
    for chunk in chunks:
        chunk.columns = [column.replace(' ','_') for column in chunk.columns]
        chunk.to_sql('tripdata', db,if_exists='append')
https://s3.amazonaws.com/tripdata/201705-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201706-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201707-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201708-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201709-citibike-tripdata.csv.zip
pd.read_sql_query('SELECT count(*) FROM tripdata',db)
count(*)
0 8685057
pd.read_sql_query('SELECT * FROM tripdata LIMIT 10',db)
index tripduration starttime stoptime start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude bikeid usertype birth_year gender
0 0 254 2017-05-01 00:00:13 2017-05-01 00:04:27 511 E 14 St & Avenue B 40.729387 -73.977724 394 E 9 St & Avenue C 40.725213 -73.977688 27695 Subscriber 1996.0 2
1 1 248 2017-05-01 00:00:19 2017-05-01 00:04:28 511 E 14 St & Avenue B 40.729387 -73.977724 394 E 9 St & Avenue C 40.725213 -73.977688 15869 Subscriber 1996.0 1
2 2 1120 2017-05-01 00:00:19 2017-05-01 00:19:00 242 Carlton Ave & Flushing Ave 40.697787 -73.973736 3083 Bushwick Ave & Powers St 40.712477 -73.941000 18700 Subscriber 1985.0 2
3 3 212 2017-05-01 00:00:24 2017-05-01 00:03:56 168 W 18 St & 6 Ave 40.739713 -73.994564 116 W 17 St & 8 Ave 40.741776 -74.001497 24981 Subscriber 1993.0 1
4 4 686 2017-05-01 00:00:29 2017-05-01 00:11:55 494 W 26 St & 8 Ave 40.747348 -73.997236 527 E 33 St & 2 Ave 40.744023 -73.976056 25407 Subscriber 1964.0 1
5 5 577 2017-05-01 00:00:35 2017-05-01 00:10:12 334 W 20 St & 7 Ave 40.742388 -73.997262 504 1 Ave & E 16 St 40.732219 -73.981656 28713 Subscriber 1956.0 2
6 6 523 2017-05-01 00:00:41 2017-05-01 00:09:24 335 Washington Pl & Broadway 40.729039 -73.994046 487 E 20 St & FDR Drive 40.733143 -73.975739 15385 Subscriber 1994.0 1
7 7 419 2017-05-01 00:00:45 2017-05-01 00:07:44 336 Sullivan St & Washington Sq 40.730477 -73.999061 369 Washington Pl & 6 Ave 40.732241 -74.000264 18295 Customer NaN 0
8 8 518 2017-05-01 00:00:47 2017-05-01 00:09:25 335 Washington Pl & Broadway 40.729039 -73.994046 487 E 20 St & FDR Drive 40.733143 -73.975739 15608 Subscriber 1995.0 1
9 9 1296 2017-05-01 00:00:48 2017-05-01 00:22:25 291 Madison St & Montgomery St 40.713126 -73.984844 291 Madison St & Montgomery St 40.713126 -73.984844 17351 Subscriber 1994.0 1
pd.read_sql_query('SELECT min(birth_year), max(birth_year) FROM tripdata',db)
min(birth_year) max(birth_year)
0 1874.0 2001.0
pd.read_sql_query('SELECT min(tripduration),max(tripduration) FROM tripdata',db)
min(tripduration) max(tripduration)
0 61 5807661
pd.read_sql_query('SELECT avg(tripduration) FROM tripdata',db)
avg(tripduration)
0 1055.770104
res =pd.read_sql_query('SELECT * FROM tripdata',db,chunksize=100_000)
next(res)
index tripduration starttime stoptime start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude bikeid usertype birth_year gender
0 0 254 2017-05-01 00:00:13 2017-05-01 00:04:27 511 E 14 St & Avenue B 40.729387 -73.977724 394 E 9 St & Avenue C 40.725213 -73.977688 27695 Subscriber 1996.0 2
1 1 248 2017-05-01 00:00:19 2017-05-01 00:04:28 511 E 14 St & Avenue B 40.729387 -73.977724 394 E 9 St & Avenue C 40.725213 -73.977688 15869 Subscriber 1996.0 1
2 2 1120 2017-05-01 00:00:19 2017-05-01 00:19:00 242 Carlton Ave & Flushing Ave 40.697787 -73.973736 3083 Bushwick Ave & Powers St 40.712477 -73.941000 18700 Subscriber 1985.0 2
3 3 212 2017-05-01 00:00:24 2017-05-01 00:03:56 168 W 18 St & 6 Ave 40.739713 -73.994564 116 W 17 St & 8 Ave 40.741776 -74.001497 24981 Subscriber 1993.0 1
4 4 686 2017-05-01 00:00:29 2017-05-01 00:11:55 494 W 26 St & 8 Ave 40.747348 -73.997236 527 E 33 St & 2 Ave 40.744023 -73.976056 25407 Subscriber 1964.0 1
5 5 577 2017-05-01 00:00:35 2017-05-01 00:10:12 334 W 20 St & 7 Ave 40.742388 -73.997262 504 1 Ave & E 16 St 40.732219 -73.981656 28713 Subscriber 1956.0 2
6 6 523 2017-05-01 00:00:41 2017-05-01 00:09:24 335 Washington Pl & Broadway 40.729039 -73.994046 487 E 20 St & FDR Drive 40.733143 -73.975739 15385 Subscriber 1994.0 1
7 7 419 2017-05-01 00:00:45 2017-05-01 00:07:44 336 Sullivan St & Washington Sq 40.730477 -73.999061 369 Washington Pl & 6 Ave 40.732241 -74.000264 18295 Customer NaN 0
8 8 518 2017-05-01 00:00:47 2017-05-01 00:09:25 335 Washington Pl & Broadway 40.729039 -73.994046 487 E 20 St & FDR Drive 40.733143 -73.975739 15608 Subscriber 1995.0 1
9 9 1296 2017-05-01 00:00:48 2017-05-01 00:22:25 291 Madison St & Montgomery St 40.713126 -73.984844 291 Madison St & Montgomery St 40.713126 -73.984844 17351 Subscriber 1994.0 1
10 10 418 2017-05-01 00:00:59 2017-05-01 00:07:58 336 Sullivan St & Washington Sq 40.730477 -73.999061 369 Washington Pl & 6 Ave 40.732241 -74.000264 28237 Customer NaN 0
11 11 589 2017-05-01 00:01:00 2017-05-01 00:10:50 415 Pearl St & Hanover Square 40.704718 -74.009260 340 Madison St & Clinton St 40.712690 -73.987763 15289 Subscriber 1952.0 1
12 12 735 2017-05-01 00:01:03 2017-05-01 00:13:18 330 Reade St & Broadway 40.714505 -74.005628 312 Allen St & Stanton St 40.722055 -73.989111 27270 Subscriber 1993.0 1
13 13 950 2017-05-01 00:01:00 2017-05-01 00:16:50 439 E 4 St & 2 Ave 40.726281 -73.989780 293 Lafayette St & E 8 St 40.730207 -73.991026 18740 Customer NaN 0
14 14 2205 2017-05-01 00:01:03 2017-05-01 00:37:49 306 Cliff St & Fulton St 40.708235 -74.005301 306 Cliff St & Fulton St 40.708235 -74.005301 17746 Subscriber 1986.0 1
15 15 391 2017-05-01 00:01:10 2017-05-01 00:07:41 3429 Hanson Pl & Ashland Pl 40.685068 -73.977908 419 Carlton Ave & Park Ave 40.695807 -73.973556 26118 Subscriber 1984.0 1
16 16 320 2017-05-01 00:01:11 2017-05-01 00:06:32 387 Centre St & Chambers St 40.712733 -74.004607 2008 Little West St & 1 Pl 40.705693 -74.016777 26691 Subscriber 1992.0 1
17 17 273 2017-05-01 00:01:11 2017-05-01 00:05:45 3090 N 8 St & Driggs Ave 40.717746 -73.956001 3111 Norman Ave & Leonard St - 2 40.725848 -73.950649 28209 Subscriber 1988.0 1
18 18 461 2017-05-01 00:01:40 2017-05-01 00:09:21 357 E 11 St & Broadway 40.732618 -73.991580 301 E 2 St & Avenue B 40.722174 -73.983688 27536 Subscriber 1994.0 1
19 19 345 2017-05-01 00:01:46 2017-05-01 00:07:32 350 Clinton St & Grand St 40.715595 -73.987030 511 E 14 St & Avenue B 40.729387 -73.977724 29347 Subscriber 1974.0 1
20 20 616 2017-05-01 00:02:01 2017-05-01 00:12:17 3074 Montrose Ave & Bushwick Ave 40.707678 -73.940162 3107 Bedford Ave & Nassau Ave 40.723117 -73.952123 27667 Subscriber 1980.0 2
21 21 552 2017-05-01 00:02:04 2017-05-01 00:11:16 446 W 24 St & 7 Ave 40.744876 -73.995299 368 Carmine St & 6 Ave 40.730386 -74.002150 28913 Subscriber 1985.0 1
22 22 628 2017-05-01 00:02:14 2017-05-01 00:12:42 434 9 Ave & W 18 St 40.743174 -74.003664 236 St Marks Pl & 2 Ave 40.728419 -73.987140 29346 Subscriber 1996.0 1
23 23 156 2017-05-01 00:02:18 2017-05-01 00:04:55 3428 8 Ave & W 16 St 40.740983 -74.001702 509 9 Ave & W 22 St 40.745497 -74.001971 27170 Subscriber 1981.0 1
24 24 305 2017-05-01 00:02:27 2017-05-01 00:07:33 504 1 Ave & E 16 St 40.732219 -73.981656 403 E 2 St & 2 Ave 40.725029 -73.990697 21322 Subscriber 1997.0 1
25 25 372 2017-05-01 00:02:47 2017-05-01 00:08:59 387 Centre St & Chambers St 40.712733 -74.004607 2008 Little West St & 1 Pl 40.705693 -74.016777 25564 Subscriber 1982.0 1
26 26 161 2017-05-01 00:02:54 2017-05-01 00:05:35 399 Lafayette Ave & St James Pl 40.688515 -73.964763 384 Fulton St & Washington Ave 40.683048 -73.964915 20545 Subscriber 1987.0 1
27 27 227 2017-05-01 00:02:55 2017-05-01 00:06:42 508 W 46 St & 11 Ave 40.763414 -73.996674 450 W 49 St & 8 Ave 40.762272 -73.987882 26622 Subscriber 1981.0 1
28 28 1636 2017-05-01 00:03:12 2017-05-01 00:30:29 477 W 41 St & 8 Ave 40.756405 -73.990026 410 Suffolk St & Stanton St 40.720664 -73.985180 24814 Subscriber 1962.0 2
29 29 1667 2017-05-01 00:03:16 2017-05-01 00:31:03 173 Broadway & W 49 St 40.760683 -73.984527 358 Christopher St & Greenwich St 40.732916 -74.007114 26887 Customer NaN 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
99970 99970 1606 2017-05-02 18:04:12 2017-05-02 18:30:59 168 W 18 St & 6 Ave 40.739713 -73.994564 360 William St & Pine St 40.707179 -74.008873 25697 Subscriber 1988.0 2
99971 99971 325 2017-05-02 18:04:15 2017-05-02 18:09:41 533 Broadway & W 39 St 40.752996 -73.987216 533 Broadway & W 39 St 40.752996 -73.987216 26634 Subscriber 1980.0 1
99972 99972 468 2017-05-02 18:04:23 2017-05-02 18:12:11 3440 Fulton St & Adams St 40.692418 -73.989495 467 Dean St & 4 Ave 40.683125 -73.978951 25152 Subscriber 1963.0 1
99973 99973 974 2017-05-02 18:04:16 2017-05-02 18:20:31 3165 Central Park West & W 72 St 40.775794 -73.976206 3137 5 Ave & E 73 St 40.772828 -73.966853 20869 Customer NaN 0
99974 99974 243 2017-05-02 18:04:16 2017-05-02 18:08:20 3093 N 6 St & Bedford Ave 40.717452 -73.958509 3102 Driggs Ave & Lorimer St 40.721791 -73.950415 28112 Subscriber 1981.0 1
99975 99975 486 2017-05-02 18:04:16 2017-05-02 18:12:22 3443 W 52 St & 6 Ave 40.761330 -73.979820 305 E 58 St & 3 Ave 40.760958 -73.967245 20010 Subscriber 1988.0 1
99976 99976 125 2017-05-02 18:04:17 2017-05-02 18:06:23 2008 Little West St & 1 Pl 40.705693 -74.016777 2008 Little West St & 1 Pl 40.705693 -74.016777 29367 Customer NaN 0
99977 99977 432 2017-05-02 18:04:17 2017-05-02 18:11:30 3390 E 109 St & 3 Ave 40.793297 -73.943208 3312 1 Ave & E 94 St 40.781721 -73.945940 27005 Subscriber 1985.0 1
99978 99978 1980 2017-05-02 18:04:17 2017-05-02 18:37:17 440 E 45 St & 3 Ave 40.752554 -73.972826 426 West St & Chambers St 40.717548 -74.013221 27077 Subscriber 1981.0 2
99979 99979 1013 2017-05-02 18:04:19 2017-05-02 18:21:13 3057 Kosciuszko St & Tompkins Ave 40.691283 -73.945242 3102 Driggs Ave & Lorimer St 40.721791 -73.950415 27805 Subscriber 1992.0 2
99980 99980 2273 2017-05-02 18:04:19 2017-05-02 18:42:13 3002 South End Ave & Liberty St 40.711512 -74.015756 530 11 Ave & W 59 St 40.771522 -73.990541 17279 Subscriber 1955.0 2
99981 99981 353 2017-05-02 18:04:21 2017-05-02 18:10:14 252 MacDougal St & Washington Sq 40.732264 -73.998522 445 E 10 St & Avenue A 40.727408 -73.981420 27765 Subscriber 1992.0 1
99982 99982 1642 2017-05-02 18:04:22 2017-05-02 18:31:44 359 E 47 St & Park Ave 40.755103 -73.974987 3147 E 85 St & 3 Ave 40.778012 -73.954071 20809 Subscriber 1961.0 2
99983 99983 1402 2017-05-02 18:04:21 2017-05-02 18:27:44 3158 W 63 St & Broadway 40.771639 -73.982614 402 Broadway & E 22 St 40.740343 -73.989551 27577 Subscriber 1978.0 1
99984 99984 515 2017-05-02 18:04:22 2017-05-02 18:12:57 382 University Pl & E 14 St 40.734927 -73.992005 446 W 24 St & 7 Ave 40.744876 -73.995299 25648 Subscriber 1986.0 1
99985 99985 424 2017-05-02 18:04:23 2017-05-02 18:11:28 478 11 Ave & W 41 St 40.760301 -73.998842 529 W 42 St & 8 Ave 40.757570 -73.990985 27017 Subscriber 1968.0 1
99986 99986 816 2017-05-02 18:04:24 2017-05-02 18:18:01 3002 South End Ave & Liberty St 40.711512 -74.015756 358 Christopher St & Greenwich St 40.732916 -74.007114 17504 Subscriber 1971.0 2
99987 99987 1712 2017-05-02 18:04:33 2017-05-02 18:33:05 3137 5 Ave & E 73 St 40.772828 -73.966853 3282 5 Ave & E 88 St 40.783070 -73.959390 28558 Subscriber 1994.0 2
99988 99988 1194 2017-05-02 18:04:26 2017-05-02 18:24:21 253 W 13 St & 5 Ave 40.735439 -73.994539 3002 South End Ave & Liberty St 40.711512 -74.015756 25335 Subscriber 1974.0 1
99989 99989 598 2017-05-02 18:04:27 2017-05-02 18:14:25 268 Howard St & Centre St 40.719105 -73.999733 3002 South End Ave & Liberty St 40.711512 -74.015756 28662 Subscriber 1984.0 1
99990 99990 408 2017-05-02 18:04:27 2017-05-02 18:11:16 501 FDR Drive & E 35 St 40.744219 -73.971212 498 Broadway & W 32 St 40.748549 -73.988084 27307 Subscriber 1983.0 1
99991 99991 972 2017-05-02 18:04:26 2017-05-02 18:20:39 3158 W 63 St & Broadway 40.771639 -73.982614 490 8 Ave & W 33 St 40.751551 -73.993934 18888 Subscriber 1963.0 1
99992 99992 1345 2017-05-02 18:04:28 2017-05-02 18:26:54 494 W 26 St & 8 Ave 40.747348 -73.997236 457 Broadway & W 58 St 40.766953 -73.981693 28880 Subscriber 1980.0 2
99993 99993 1178 2017-05-02 18:04:29 2017-05-02 18:24:08 3178 Riverside Dr & W 78 St 40.784145 -73.983625 388 W 26 St & 10 Ave 40.749718 -74.002950 18646 Subscriber 1988.0 1
99994 99994 1582 2017-05-02 18:04:28 2017-05-02 18:30:51 3117 Franklin St & Dupont St 40.735640 -73.958660 414 Pearl St & Anchorage Pl 40.702819 -73.987658 28792 Subscriber 1976.0 1
99995 99995 2530 2017-05-02 18:04:29 2017-05-02 18:46:39 3169 Riverside Dr & W 82 St 40.787209 -73.981281 515 W 43 St & 10 Ave 40.760094 -73.994618 26463 Customer NaN 0
99996 99996 1760 2017-05-02 18:04:27 2017-05-02 18:33:48 382 University Pl & E 14 St 40.734927 -73.992005 497 E 17 St & Broadway 40.737050 -73.990093 29094 Subscriber 1987.0 1
99997 99997 944 2017-05-02 18:04:30 2017-05-02 18:20:15 402 Broadway & E 22 St 40.740343 -73.989551 524 W 43 St & 6 Ave 40.755273 -73.983169 28542 Subscriber 1987.0 2
99998 99998 352 2017-05-02 18:04:30 2017-05-02 18:10:22 498 Broadway & W 32 St 40.748549 -73.988084 442 W 27 St & 7 Ave 40.746647 -73.993915 16621 Subscriber 1989.0 1
99999 99999 227 2017-05-02 18:04:31 2017-05-02 18:08:18 3150 E 85 St & York Ave 40.775369 -73.948034 3305 E 91 St & 2 Ave 40.781122 -73.949656 28116 Subscriber 1969.0 1

100000 rows × 16 columns

def Q(sql):
    res= pd.read_sql_query(sql,db, chunksize=100_000)
    return next(res)
Q('SELECT * FROM tripdata')
index tripduration starttime stoptime start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude bikeid usertype birth_year gender
0 0 254 2017-05-01 00:00:13 2017-05-01 00:04:27 511 E 14 St & Avenue B 40.729387 -73.977724 394 E 9 St & Avenue C 40.725213 -73.977688 27695 Subscriber 1996.0 2
1 1 248 2017-05-01 00:00:19 2017-05-01 00:04:28 511 E 14 St & Avenue B 40.729387 -73.977724 394 E 9 St & Avenue C 40.725213 -73.977688 15869 Subscriber 1996.0 1
2 2 1120 2017-05-01 00:00:19 2017-05-01 00:19:00 242 Carlton Ave & Flushing Ave 40.697787 -73.973736 3083 Bushwick Ave & Powers St 40.712477 -73.941000 18700 Subscriber 1985.0 2
3 3 212 2017-05-01 00:00:24 2017-05-01 00:03:56 168 W 18 St & 6 Ave 40.739713 -73.994564 116 W 17 St & 8 Ave 40.741776 -74.001497 24981 Subscriber 1993.0 1
4 4 686 2017-05-01 00:00:29 2017-05-01 00:11:55 494 W 26 St & 8 Ave 40.747348 -73.997236 527 E 33 St & 2 Ave 40.744023 -73.976056 25407 Subscriber 1964.0 1
5 5 577 2017-05-01 00:00:35 2017-05-01 00:10:12 334 W 20 St & 7 Ave 40.742388 -73.997262 504 1 Ave & E 16 St 40.732219 -73.981656 28713 Subscriber 1956.0 2
6 6 523 2017-05-01 00:00:41 2017-05-01 00:09:24 335 Washington Pl & Broadway 40.729039 -73.994046 487 E 20 St & FDR Drive 40.733143 -73.975739 15385 Subscriber 1994.0 1
7 7 419 2017-05-01 00:00:45 2017-05-01 00:07:44 336 Sullivan St & Washington Sq 40.730477 -73.999061 369 Washington Pl & 6 Ave 40.732241 -74.000264 18295 Customer NaN 0
8 8 518 2017-05-01 00:00:47 2017-05-01 00:09:25 335 Washington Pl & Broadway 40.729039 -73.994046 487 E 20 St & FDR Drive 40.733143 -73.975739 15608 Subscriber 1995.0 1
9 9 1296 2017-05-01 00:00:48 2017-05-01 00:22:25 291 Madison St & Montgomery St 40.713126 -73.984844 291 Madison St & Montgomery St 40.713126 -73.984844 17351 Subscriber 1994.0 1
10 10 418 2017-05-01 00:00:59 2017-05-01 00:07:58 336 Sullivan St & Washington Sq 40.730477 -73.999061 369 Washington Pl & 6 Ave 40.732241 -74.000264 28237 Customer NaN 0
11 11 589 2017-05-01 00:01:00 2017-05-01 00:10:50 415 Pearl St & Hanover Square 40.704718 -74.009260 340 Madison St & Clinton St 40.712690 -73.987763 15289 Subscriber 1952.0 1
12 12 735 2017-05-01 00:01:03 2017-05-01 00:13:18 330 Reade St & Broadway 40.714505 -74.005628 312 Allen St & Stanton St 40.722055 -73.989111 27270 Subscriber 1993.0 1
13 13 950 2017-05-01 00:01:00 2017-05-01 00:16:50 439 E 4 St & 2 Ave 40.726281 -73.989780 293 Lafayette St & E 8 St 40.730207 -73.991026 18740 Customer NaN 0
14 14 2205 2017-05-01 00:01:03 2017-05-01 00:37:49 306 Cliff St & Fulton St 40.708235 -74.005301 306 Cliff St & Fulton St 40.708235 -74.005301 17746 Subscriber 1986.0 1
15 15 391 2017-05-01 00:01:10 2017-05-01 00:07:41 3429 Hanson Pl & Ashland Pl 40.685068 -73.977908 419 Carlton Ave & Park Ave 40.695807 -73.973556 26118 Subscriber 1984.0 1
16 16 320 2017-05-01 00:01:11 2017-05-01 00:06:32 387 Centre St & Chambers St 40.712733 -74.004607 2008 Little West St & 1 Pl 40.705693 -74.016777 26691 Subscriber 1992.0 1
17 17 273 2017-05-01 00:01:11 2017-05-01 00:05:45 3090 N 8 St & Driggs Ave 40.717746 -73.956001 3111 Norman Ave & Leonard St - 2 40.725848 -73.950649 28209 Subscriber 1988.0 1
18 18 461 2017-05-01 00:01:40 2017-05-01 00:09:21 357 E 11 St & Broadway 40.732618 -73.991580 301 E 2 St & Avenue B 40.722174 -73.983688 27536 Subscriber 1994.0 1
19 19 345 2017-05-01 00:01:46 2017-05-01 00:07:32 350 Clinton St & Grand St 40.715595 -73.987030 511 E 14 St & Avenue B 40.729387 -73.977724 29347 Subscriber 1974.0 1
20 20 616 2017-05-01 00:02:01 2017-05-01 00:12:17 3074 Montrose Ave & Bushwick Ave 40.707678 -73.940162 3107 Bedford Ave & Nassau Ave 40.723117 -73.952123 27667 Subscriber 1980.0 2
21 21 552 2017-05-01 00:02:04 2017-05-01 00:11:16 446 W 24 St & 7 Ave 40.744876 -73.995299 368 Carmine St & 6 Ave 40.730386 -74.002150 28913 Subscriber 1985.0 1
22 22 628 2017-05-01 00:02:14 2017-05-01 00:12:42 434 9 Ave & W 18 St 40.743174 -74.003664 236 St Marks Pl & 2 Ave 40.728419 -73.987140 29346 Subscriber 1996.0 1
23 23 156 2017-05-01 00:02:18 2017-05-01 00:04:55 3428 8 Ave & W 16 St 40.740983 -74.001702 509 9 Ave & W 22 St 40.745497 -74.001971 27170 Subscriber 1981.0 1
24 24 305 2017-05-01 00:02:27 2017-05-01 00:07:33 504 1 Ave & E 16 St 40.732219 -73.981656 403 E 2 St & 2 Ave 40.725029 -73.990697 21322 Subscriber 1997.0 1
25 25 372 2017-05-01 00:02:47 2017-05-01 00:08:59 387 Centre St & Chambers St 40.712733 -74.004607 2008 Little West St & 1 Pl 40.705693 -74.016777 25564 Subscriber 1982.0 1
26 26 161 2017-05-01 00:02:54 2017-05-01 00:05:35 399 Lafayette Ave & St James Pl 40.688515 -73.964763 384 Fulton St & Washington Ave 40.683048 -73.964915 20545 Subscriber 1987.0 1
27 27 227 2017-05-01 00:02:55 2017-05-01 00:06:42 508 W 46 St & 11 Ave 40.763414 -73.996674 450 W 49 St & 8 Ave 40.762272 -73.987882 26622 Subscriber 1981.0 1
28 28 1636 2017-05-01 00:03:12 2017-05-01 00:30:29 477 W 41 St & 8 Ave 40.756405 -73.990026 410 Suffolk St & Stanton St 40.720664 -73.985180 24814 Subscriber 1962.0 2
29 29 1667 2017-05-01 00:03:16 2017-05-01 00:31:03 173 Broadway & W 49 St 40.760683 -73.984527 358 Christopher St & Greenwich St 40.732916 -74.007114 26887 Customer NaN 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
99970 99970 1606 2017-05-02 18:04:12 2017-05-02 18:30:59 168 W 18 St & 6 Ave 40.739713 -73.994564 360 William St & Pine St 40.707179 -74.008873 25697 Subscriber 1988.0 2
99971 99971 325 2017-05-02 18:04:15 2017-05-02 18:09:41 533 Broadway & W 39 St 40.752996 -73.987216 533 Broadway & W 39 St 40.752996 -73.987216 26634 Subscriber 1980.0 1
99972 99972 468 2017-05-02 18:04:23 2017-05-02 18:12:11 3440 Fulton St & Adams St 40.692418 -73.989495 467 Dean St & 4 Ave 40.683125 -73.978951 25152 Subscriber 1963.0 1
99973 99973 974 2017-05-02 18:04:16 2017-05-02 18:20:31 3165 Central Park West & W 72 St 40.775794 -73.976206 3137 5 Ave & E 73 St 40.772828 -73.966853 20869 Customer NaN 0
99974 99974 243 2017-05-02 18:04:16 2017-05-02 18:08:20 3093 N 6 St & Bedford Ave 40.717452 -73.958509 3102 Driggs Ave & Lorimer St 40.721791 -73.950415 28112 Subscriber 1981.0 1
99975 99975 486 2017-05-02 18:04:16 2017-05-02 18:12:22 3443 W 52 St & 6 Ave 40.761330 -73.979820 305 E 58 St & 3 Ave 40.760958 -73.967245 20010 Subscriber 1988.0 1
99976 99976 125 2017-05-02 18:04:17 2017-05-02 18:06:23 2008 Little West St & 1 Pl 40.705693 -74.016777 2008 Little West St & 1 Pl 40.705693 -74.016777 29367 Customer NaN 0
99977 99977 432 2017-05-02 18:04:17 2017-05-02 18:11:30 3390 E 109 St & 3 Ave 40.793297 -73.943208 3312 1 Ave & E 94 St 40.781721 -73.945940 27005 Subscriber 1985.0 1
99978 99978 1980 2017-05-02 18:04:17 2017-05-02 18:37:17 440 E 45 St & 3 Ave 40.752554 -73.972826 426 West St & Chambers St 40.717548 -74.013221 27077 Subscriber 1981.0 2
99979 99979 1013 2017-05-02 18:04:19 2017-05-02 18:21:13 3057 Kosciuszko St & Tompkins Ave 40.691283 -73.945242 3102 Driggs Ave & Lorimer St 40.721791 -73.950415 27805 Subscriber 1992.0 2
99980 99980 2273 2017-05-02 18:04:19 2017-05-02 18:42:13 3002 South End Ave & Liberty St 40.711512 -74.015756 530 11 Ave & W 59 St 40.771522 -73.990541 17279 Subscriber 1955.0 2
99981 99981 353 2017-05-02 18:04:21 2017-05-02 18:10:14 252 MacDougal St & Washington Sq 40.732264 -73.998522 445 E 10 St & Avenue A 40.727408 -73.981420 27765 Subscriber 1992.0 1
99982 99982 1642 2017-05-02 18:04:22 2017-05-02 18:31:44 359 E 47 St & Park Ave 40.755103 -73.974987 3147 E 85 St & 3 Ave 40.778012 -73.954071 20809 Subscriber 1961.0 2
99983 99983 1402 2017-05-02 18:04:21 2017-05-02 18:27:44 3158 W 63 St & Broadway 40.771639 -73.982614 402 Broadway & E 22 St 40.740343 -73.989551 27577 Subscriber 1978.0 1
99984 99984 515 2017-05-02 18:04:22 2017-05-02 18:12:57 382 University Pl & E 14 St 40.734927 -73.992005 446 W 24 St & 7 Ave 40.744876 -73.995299 25648 Subscriber 1986.0 1
99985 99985 424 2017-05-02 18:04:23 2017-05-02 18:11:28 478 11 Ave & W 41 St 40.760301 -73.998842 529 W 42 St & 8 Ave 40.757570 -73.990985 27017 Subscriber 1968.0 1
99986 99986 816 2017-05-02 18:04:24 2017-05-02 18:18:01 3002 South End Ave & Liberty St 40.711512 -74.015756 358 Christopher St & Greenwich St 40.732916 -74.007114 17504 Subscriber 1971.0 2
99987 99987 1712 2017-05-02 18:04:33 2017-05-02 18:33:05 3137 5 Ave & E 73 St 40.772828 -73.966853 3282 5 Ave & E 88 St 40.783070 -73.959390 28558 Subscriber 1994.0 2
99988 99988 1194 2017-05-02 18:04:26 2017-05-02 18:24:21 253 W 13 St & 5 Ave 40.735439 -73.994539 3002 South End Ave & Liberty St 40.711512 -74.015756 25335 Subscriber 1974.0 1
99989 99989 598 2017-05-02 18:04:27 2017-05-02 18:14:25 268 Howard St & Centre St 40.719105 -73.999733 3002 South End Ave & Liberty St 40.711512 -74.015756 28662 Subscriber 1984.0 1
99990 99990 408 2017-05-02 18:04:27 2017-05-02 18:11:16 501 FDR Drive & E 35 St 40.744219 -73.971212 498 Broadway & W 32 St 40.748549 -73.988084 27307 Subscriber 1983.0 1
99991 99991 972 2017-05-02 18:04:26 2017-05-02 18:20:39 3158 W 63 St & Broadway 40.771639 -73.982614 490 8 Ave & W 33 St 40.751551 -73.993934 18888 Subscriber 1963.0 1
99992 99992 1345 2017-05-02 18:04:28 2017-05-02 18:26:54 494 W 26 St & 8 Ave 40.747348 -73.997236 457 Broadway & W 58 St 40.766953 -73.981693 28880 Subscriber 1980.0 2
99993 99993 1178 2017-05-02 18:04:29 2017-05-02 18:24:08 3178 Riverside Dr & W 78 St 40.784145 -73.983625 388 W 26 St & 10 Ave 40.749718 -74.002950 18646 Subscriber 1988.0 1
99994 99994 1582 2017-05-02 18:04:28 2017-05-02 18:30:51 3117 Franklin St & Dupont St 40.735640 -73.958660 414 Pearl St & Anchorage Pl 40.702819 -73.987658 28792 Subscriber 1976.0 1
99995 99995 2530 2017-05-02 18:04:29 2017-05-02 18:46:39 3169 Riverside Dr & W 82 St 40.787209 -73.981281 515 W 43 St & 10 Ave 40.760094 -73.994618 26463 Customer NaN 0
99996 99996 1760 2017-05-02 18:04:27 2017-05-02 18:33:48 382 University Pl & E 14 St 40.734927 -73.992005 497 E 17 St & Broadway 40.737050 -73.990093 29094 Subscriber 1987.0 1
99997 99997 944 2017-05-02 18:04:30 2017-05-02 18:20:15 402 Broadway & E 22 St 40.740343 -73.989551 524 W 43 St & 6 Ave 40.755273 -73.983169 28542 Subscriber 1987.0 2
99998 99998 352 2017-05-02 18:04:30 2017-05-02 18:10:22 498 Broadway & W 32 St 40.748549 -73.988084 442 W 27 St & 7 Ave 40.746647 -73.993915 16621 Subscriber 1989.0 1
99999 99999 227 2017-05-02 18:04:31 2017-05-02 18:08:18 3150 E 85 St & York Ave 40.775369 -73.948034 3305 E 91 St & 2 Ave 40.781122 -73.949656 28116 Subscriber 1969.0 1

100000 rows × 16 columns

geneder query

res=Q('SELECT gender, count(*) from tripdata group by gender')
res
gender count(*)
0 0 1058729
1 1 5618129
2 2 2008199
res=Q( 'select usertype, gender,count(*) from tripdata group by gender, usertype' )
res
usertype gender count(*)
0 Customer 0 946624
1 Subscriber 0 112105
2 Customer 1 133632
3 Subscriber 1 5484497
4 Customer 2 78906
5 Subscriber 2 1929293
res = Q('''
SELECT
    (2020 - birth_year) as age,
    count(*)
from tripdata
group by age
''')
res
age count(*)
0 NaN 1026035
1 19.0 3111
2 20.0 13105
3 21.0 22640
4 22.0 35205
5 23.0 43961
6 24.0 61248
7 25.0 82996
8 26.0 145162
9 27.0 225350
10 28.0 275820
11 29.0 303805
12 30.0 329242
13 31.0 333745
14 32.0 336104
15 33.0 324443
16 34.0 320144
17 35.0 315356
18 36.0 290491
19 37.0 272767
20 38.0 254227
21 39.0 235317
22 40.0 213739
23 41.0 195974
24 42.0 178737
25 43.0 167309
26 44.0 164558
27 45.0 151740
28 46.0 153644
29 47.0 140617
... ... ...
73 91.0 99
74 92.0 69
75 93.0 114
76 94.0 95
77 97.0 532
78 99.0 56
79 100.0 8
80 102.0 100
81 103.0 133
82 104.0 7
83 105.0 17
84 107.0 34
85 108.0 137
86 110.0 193
87 113.0 17
88 115.0 3
89 119.0 224
90 120.0 921
91 121.0 165
92 124.0 21
93 125.0 76
94 126.0 32
95 127.0 105
96 130.0 5
97 131.0 3
98 132.0 21
99 133.0 48
100 134.0 65
101 135.0 245
102 146.0 6

103 rows × 2 columns

res = Q('''
SELECT
    (2020 - birth_year) as age,
    count(*)
from tripdata
group by age
having age >1
''')
res
age count(*)
0 19.0 3111
1 20.0 13105
2 21.0 22640
3 22.0 35205
4 23.0 43961
5 24.0 61248
6 25.0 82996
7 26.0 145162
8 27.0 225350
9 28.0 275820
10 29.0 303805
11 30.0 329242
12 31.0 333745
13 32.0 336104
14 33.0 324443
15 34.0 320144
16 35.0 315356
17 36.0 290491
18 37.0 272767
19 38.0 254227
20 39.0 235317
21 40.0 213739
22 41.0 195974
23 42.0 178737
24 43.0 167309
25 44.0 164558
26 45.0 151740
27 46.0 153644
28 47.0 140617
29 48.0 138148
... ... ...
72 91.0 99
73 92.0 69
74 93.0 114
75 94.0 95
76 97.0 532
77 99.0 56
78 100.0 8
79 102.0 100
80 103.0 133
81 104.0 7
82 105.0 17
83 107.0 34
84 108.0 137
85 110.0 193
86 113.0 17
87 115.0 3
88 119.0 224
89 120.0 921
90 121.0 165
91 124.0 21
92 125.0 76
93 126.0 32
94 127.0 105
95 130.0 5
96 131.0 3
97 132.0 21
98 133.0 48
99 134.0 65
100 135.0 245
101 146.0 6

102 rows × 2 columns

res = Q('''
SELECT bikeid, sum(tripduration)
FROM tripdata
GROUP BY bikeid
ORDER BY 2
''')
res
bikeid sum(tripduration)
0 31358 93
1 31789 157
2 29838 209
3 26258 274
4 31941 304
5 29301 325
6 30405 353
7 30829 451
8 30402 467
9 29843 593
10 30294 642
11 29610 667
12 25231 716
13 31107 741
14 24357 773
15 30339 773
16 30050 877
17 29644 939
18 31807 956
19 29637 1086
20 30157 1265
21 29836 1273
22 29253 1315
23 29501 1326
24 29845 1457
25 29824 1460
26 29217 1498
27 31784 1530
28 31782 1563
29 31827 1593
... ... ...
12148 16666 3874348
12149 18811 3875045
12150 21308 3910883
12151 20541 4071344
12152 25707 4115980
12153 28014 4191835
12154 20596 4225521
12155 17515 4236450
12156 26484 4237832
12157 29755 4330469
12158 15462 4374386
12159 24843 4382533
12160 25360 4500421
12161 25945 4507709
12162 27892 4517035
12163 26591 4517352
12164 17428 4531047
12165 26970 4610802
12166 28873 4644748
12167 26975 4682062
12168 17799 4709650
12169 28653 4781705
12170 20145 4811001
12171 17928 5115114
12172 15062 5189426
12173 25505 5218382
12174 21683 5386738
12175 26502 5867954
12176 21173 6164703
12177 16811 6223765

12178 rows × 2 columns

res = Q('''
SELECT bikeid, sum(tripduration)/3600 as trip_duration_hours
FROM tripdata
GROUP BY bikeid
ORDER BY 2
''')
res
bikeid trip_duration_hours
0 19312 0
1 22380 0
2 24357 0
3 24747 0
4 25231 0
5 26152 0
6 26216 0
7 26258 0
8 27275 0
9 29217 0
10 29253 0
11 29273 0
12 29301 0
13 29484 0
14 29501 0
15 29513 0
16 29594 0
17 29610 0
18 29619 0
19 29637 0
20 29638 0
21 29644 0
22 29650 0
23 29673 0
24 29705 0
25 29824 0
26 29831 0
27 29836 0
28 29838 0
29 29843 0
... ... ...
12148 16666 1076
12149 18811 1076
12150 21308 1086
12151 20541 1130
12152 25707 1143
12153 28014 1164
12154 20596 1173
12155 17515 1176
12156 26484 1177
12157 29755 1202
12158 15462 1215
12159 24843 1217
12160 25360 1250
12161 25945 1252
12162 26591 1254
12163 27892 1254
12164 17428 1258
12165 26970 1280
12166 28873 1290
12167 26975 1300
12168 17799 1308
12169 28653 1328
12170 20145 1336
12171 17928 1420
12172 15062 1441
12173 25505 1449
12174 21683 1496
12175 26502 1629
12176 21173 1712
12177 16811 1728

12178 rows × 2 columns

Q('''
SELECT gender, avg(tripduration)
FROM tripdata
Group by gender

''')
gender avg(tripduration)
0 0 1879.283770
1 1 906.374779
2 2 1039.558765
Q('''
SELECT bikeid, sum(tripduration) as total
FROM tripdata
group by bikeid

''')
bikeid total
0 14529 893894
1 14530 632416
2 14531 694481
3 14532 511588
4 14533 734408
5 14534 657774
6 14535 829025
7 14536 736906
8 14537 643763
9 14539 765586
10 14540 437563
11 14541 800923
12 14544 785139
13 14545 959844
14 14547 624715
15 14548 603874
16 14549 631430
17 14550 520644
18 14551 1608483
19 14552 608622
20 14553 757367
21 14555 638516
22 14556 575367
23 14558 549563
24 14559 796470
25 14561 661922
26 14562 512684
27 14563 588310
28 14565 533523
29 14566 738766
... ... ...
12148 31941 304
12149 31942 11269
12150 31944 23005
12151 31947 11386
12152 31950 20951
12153 31951 3361
12154 31953 9665
12155 31955 19491
12156 31958 18938
12157 31959 14588
12158 31960 12513
12159 31961 4597
12160 31962 14790
12161 31963 16857
12162 31964 18890
12163 31965 4726
12164 31966 11343
12165 31967 7416
12166 31968 15542
12167 31969 9512
12168 31970 17205
12169 31971 15581
12170 31972 7674
12171 31973 7402
12172 31974 9893
12173 31975 18640
12174 31976 14094
12175 31977 13542
12176 31978 2474
12177 31979 18413

12178 rows × 2 columns

Q('''
select hour_bucket, count(*) from (
SELECT bikeid, sum(tripduration)/3600 as hours, round(sum(tripduration)/3600 /100)*100 hour_bucket
FROM tripdata
group by bikeid
)
group by hour_bucket
''')
hour_bucket count(*)
0 0.0 1725
1 100.0 4336
2 200.0 4686
3 300.0 935
4 400.0 221
5 500.0 102
6 600.0 65
7 700.0 30
8 800.0 29
9 900.0 14
10 1000.0 8
11 1100.0 6
12 1200.0 10
13 1300.0 4
14 1400.0 4
15 1600.0 1
16 1700.0 2
Q('''

SELECT starttime
from tripdata
limit 3


'''
)
starttime
0 2017-05-01 00:00:13
1 2017-05-01 00:00:19
2 2017-05-01 00:00:19
Q('''

SELECT starttime,
substr(starttime,1,10)
from tripdata
limit 3


'''
)
starttime substr(starttime,1,10)
0 2017-05-01 00:00:13 2017-05-01
1 2017-05-01 00:00:19 2017-05-01
2 2017-05-01 00:00:19 2017-05-01
Q('''

SELECT starttime,
substr(starttime,1,10) as d,
substr(starttime,11,10) as t
from tripdata
limit 3


'''
)
starttime d t
0 2017-05-01 00:00:13 2017-05-01 00:00:13
1 2017-05-01 00:00:19 2017-05-01 00:00:19
2 2017-05-01 00:00:19 2017-05-01 00:00:19
Q('''

SELECT starttime,
substr(starttime,1,10) as d,
substr(starttime,1,4) as y,
substr(starttime,6,2) as m,
substr(starttime,9,2) as d,

substr(starttime,11,10) as t
from tripdata
limit 3


'''
)
starttime d y m d t
0 2017-05-01 00:00:13 2017-05-01 2017 05 01 00:00:13
1 2017-05-01 00:00:19 2017-05-01 2017 05 01 00:00:19
2 2017-05-01 00:00:19 2017-05-01 2017 05 01 00:00:19
Q('SELECT bikeid, substr(starttime, 6 , 2) as m, sum(tripduration)/3600 as total from tripdata group by bikeid,m')
bikeid m total
0 14529 05 34
1 14529 06 21
2 14529 07 128
3 14529 08 31
4 14529 09 31
5 14530 05 34
6 14530 06 39
7 14530 07 30
8 14530 08 31
9 14530 09 39
10 14531 05 35
11 14531 06 34
12 14531 07 45
13 14531 08 32
14 14531 09 45
15 14532 05 16
16 14532 06 40
17 14532 07 20
18 14532 08 33
19 14532 09 31
20 14533 05 12
21 14533 06 47
22 14533 07 48
23 14533 08 47
24 14533 09 49
25 14534 05 30
26 14534 06 51
27 14534 07 44
28 14534 08 12
29 14534 09 43
... ... ... ...
52849 31941 09 0
52850 31942 09 3
52851 31944 09 6
52852 31947 09 3
52853 31950 09 5
52854 31951 09 0
52855 31953 09 2
52856 31955 09 5
52857 31958 09 5
52858 31959 09 4
52859 31960 09 3
52860 31961 09 1
52861 31962 09 4
52862 31963 09 4
52863 31964 09 5
52864 31965 09 1
52865 31966 09 3
52866 31967 09 2
52867 31968 09 4
52868 31969 09 2
52869 31970 09 4
52870 31971 09 4
52871 31972 09 2
52872 31973 09 2
52873 31974 09 2
52874 31975 09 5
52875 31976 09 3
52876 31977 09 3
52877 31978 09 0
52878 31979 09 5

52879 rows × 3 columns

Q('''
SELECT
    bikeid,
    sum(case when m = '05' then total end)as may,
    sum(case when m = '06' then total end)as jun,
    sum(case when m = '07' then total end)as jul,
    sum(case when m = '08' then total end)as aug,
    sum(case when m = '09' then total end)as sep
FROM
(
    SELECT
        bikeid, substr(starttime, 6 , 2) as m,
        sum(tripduration)/3600 as total
    from
        tripdata
    group by
        bikeid,
        m
)
group by bikeid
''')
bikeid may jun jul aug sep
0 14529 34.0 21.0 128.0 31.0 31.0
1 14530 34.0 39.0 30.0 31.0 39.0
2 14531 35.0 34.0 45.0 32.0 45.0
3 14532 16.0 40.0 20.0 33.0 31.0
4 14533 12.0 47.0 48.0 47.0 49.0
5 14534 30.0 51.0 44.0 12.0 43.0
6 14535 42.0 57.0 43.0 43.0 42.0
7 14536 21.0 68.0 53.0 35.0 25.0
8 14537 19.0 40.0 42.0 38.0 38.0
9 14539 38.0 35.0 54.0 51.0 32.0
10 14540 20.0 NaN 12.0 50.0 37.0
11 14541 38.0 44.0 40.0 43.0 55.0
12 14544 49.0 59.0 49.0 27.0 31.0
13 14545 70.0 50.0 40.0 56.0 48.0
14 14547 30.0 4.0 33.0 57.0 48.0
15 14548 34.0 56.0 23.0 13.0 39.0
16 14549 21.0 27.0 56.0 28.0 40.0
17 14550 41.0 23.0 36.0 14.0 29.0
18 14551 22.0 40.0 311.0 31.0 40.0
19 14552 34.0 23.0 32.0 40.0 37.0
20 14553 33.0 37.0 44.0 52.0 41.0
21 14555 52.0 49.0 47.0 6.0 21.0
22 14556 13.0 24.0 56.0 36.0 29.0
23 14558 20.0 46.0 11.0 33.0 41.0
24 14559 43.0 48.0 54.0 40.0 33.0
25 14561 53.0 28.0 38.0 44.0 18.0
26 14562 21.0 37.0 40.0 29.0 13.0
27 14563 39.0 26.0 18.0 25.0 53.0
28 14565 22.0 27.0 22.0 35.0 40.0
29 14566 30.0 47.0 44.0 45.0 37.0
... ... ... ... ... ... ...
12148 31941 NaN NaN NaN NaN 0.0
12149 31942 NaN NaN NaN NaN 3.0
12150 31944 NaN NaN NaN NaN 6.0
12151 31947 NaN NaN NaN NaN 3.0
12152 31950 NaN NaN NaN NaN 5.0
12153 31951 NaN NaN NaN NaN 0.0
12154 31953 NaN NaN NaN NaN 2.0
12155 31955 NaN NaN NaN NaN 5.0
12156 31958 NaN NaN NaN NaN 5.0
12157 31959 NaN NaN NaN NaN 4.0
12158 31960 NaN NaN NaN NaN 3.0
12159 31961 NaN NaN NaN NaN 1.0
12160 31962 NaN NaN NaN NaN 4.0
12161 31963 NaN NaN NaN NaN 4.0
12162 31964 NaN NaN NaN NaN 5.0
12163 31965 NaN NaN NaN NaN 1.0
12164 31966 NaN NaN NaN NaN 3.0
12165 31967 NaN NaN NaN NaN 2.0
12166 31968 NaN NaN NaN NaN 4.0
12167 31969 NaN NaN NaN NaN 2.0
12168 31970 NaN NaN NaN NaN 4.0
12169 31971 NaN NaN NaN NaN 4.0
12170 31972 NaN NaN NaN NaN 2.0
12171 31973 NaN NaN NaN NaN 2.0
12172 31974 NaN NaN NaN NaN 2.0
12173 31975 NaN NaN NaN NaN 5.0
12174 31976 NaN NaN NaN NaN 3.0
12175 31977 NaN NaN NaN NaN 3.0
12176 31978 NaN NaN NaN NaN 0.0
12177 31979 NaN NaN NaN NaN 5.0

12178 rows × 6 columns

Q('''
select
    start_station_name,
    sum(case when gender = 1 then 1 else 0 end)as male,
    sum(case when gender = 2 then 2 else 0 end)as female
from tripdata
group by 1

''')
start_station_name male female
0 1 Ave & E 110 St 193 200
1 1 Ave & E 16 St 22764 17910
2 1 Ave & E 18 St 16518 12152
3 1 Ave & E 30 St 15321 10304
4 1 Ave & E 44 St 8654 6836
5 1 Ave & E 62 St 13905 9716
6 1 Ave & E 68 St 19418 14474
7 1 Ave & E 78 St 12751 9924
8 1 Ave & E 94 St 4070 3406
9 1 Pl & Clinton St 2385 2538
10 10 Hudson Yards 1569 1040
11 10 St & 5 Ave 2088 1952
12 10 St & 7 Ave 3064 2856
13 11 Ave & W 27 St 20088 14972
14 11 Ave & W 41 St 17370 9942
15 11 Ave & W 59 St 16723 11244
16 11 St & 35 Ave 7 8
17 12 Ave & W 40 St 26897 23924
18 12 St & 4 Ave 1490 1090
19 14 St & 5 Ave 2916 3016
20 14 St & 7 Ave 3351 2734
21 2 Ave & E 104 St 3003 1946
22 2 Ave & 36 St - Citi Bike HQ at Industry City 993 930
23 2 Ave & 9 St 1344 806
24 2 Ave & E 122 St 51 26
25 2 Ave & E 31 St 14612 9932
26 2 Ave & E 96 St 6948 5100
27 2 Ave & E 99 St 1904 1092
28 21 St & 31 Dr 3 4
29 21 St & 36 Ave 10 4
... ... ... ...
680 W 90 St & Amsterdam Ave 3919 3398
681 W 92 St & Broadway 5987 5006
682 W 95 St & Broadway 7164 5982
683 W Broadway & Spring St 11765 10224
684 W Broadway & Spring Street 3381 3032
685 Warren St & Church St 9341 6404
686 Warren St & Court St 4011 3854
687 Washington Ave & Greene Ave 2932 2790
688 Washington Ave & Park Ave 4085 3430
689 Washington Park 3616 3730
690 Washington Pl & 6 Ave 10689 7972
691 Washington Pl & Broadway 13054 8016
692 Washington St & Gansevoort St 16815 15268
693 Water - Whitehall Plaza 6126 4066
694 Watts St & Greenwich St 13572 9586
695 West Drive & Prospect Park West 5570 6770
696 West End Ave & W 107 St 3090 3104
697 West End Ave & W 94 St 5133 5016
698 West St & Chambers St 34146 35194
699 West Thames St 13502 12856
700 William St & Pine St 7941 5060
701 Willoughby Ave & Hall St 4714 5074
702 Willoughby Ave & Tompkins Ave 1423 1582
703 Willoughby Ave & Walworth St 1854 1234
704 Willoughby St & Fleet St 7210 5786
705 Wolcott St & Dwight St 807 760
706 Wyckoff St & 3 Ave 2285 2512
707 Wythe Ave & Metropolitan Ave 10631 10188
708 Yankee Ferry Terminal 2203 2818
709 York St & Jay St 9932 9494

710 rows × 3 columns

Q('''
select
    start_station_name,
    100.0 * male /(male+female) as pct_male
from

(
    select
        start_station_name,
        sum(case when gender = 1 then 1 else 0 end)as male,
        sum(case when gender = 2 then 2 else 0 end)as female
    from tripdata
    group by 1
)
''')
start_station_name pct_male
0 1 Ave & E 110 St 49.109415
1 1 Ave & E 16 St 55.966957
2 1 Ave & E 18 St 57.614231
3 1 Ave & E 30 St 59.789268
4 1 Ave & E 44 St 55.868302
5 1 Ave & E 62 St 58.867110
6 1 Ave & E 68 St 57.293757
7 1 Ave & E 78 St 56.233738
8 1 Ave & E 94 St 54.440877
9 1 Pl & Clinton St 48.446069
10 10 Hudson Yards 60.137984
11 10 St & 5 Ave 51.683168
12 10 St & 7 Ave 51.756757
13 11 Ave & W 27 St 57.296064
14 11 Ave & W 41 St 63.598418
15 11 Ave & W 59 St 59.795473
16 11 St & 35 Ave 46.666667
17 12 Ave & W 40 St 52.924972
18 12 St & 4 Ave 57.751938
19 14 St & 5 Ave 49.157114
20 14 St & 7 Ave 55.069844
21 2 Ave & E 104 St 60.678925
22 2 Ave & 36 St - Citi Bike HQ at Industry City 51.638066
23 2 Ave & 9 St 62.511628
24 2 Ave & E 122 St 66.233766
25 2 Ave & E 31 St 59.533898
26 2 Ave & E 96 St 57.669323
27 2 Ave & E 99 St 63.551402
28 21 St & 31 Dr 42.857143
29 21 St & 36 Ave 71.428571
... ... ...
680 W 90 St & Amsterdam Ave 53.560202
681 W 92 St & Broadway 54.461930
682 W 95 St & Broadway 54.495664
683 W Broadway & Spring St 53.504025
684 W Broadway & Spring Street 52.721035
685 Warren St & Church St 59.326770
686 Warren St & Court St 50.998093
687 Washington Ave & Greene Ave 51.240825
688 Washington Ave & Park Ave 54.357951
689 Washington Park 49.224068
690 Washington Pl & 6 Ave 57.279889
691 Washington Pl & Broadway 61.955387
692 Washington St & Gansevoort St 52.410934
693 Water - Whitehall Plaza 60.105965
694 Watts St & Greenwich St 58.606097
695 West Drive & Prospect Park West 45.137763
696 West End Ave & W 107 St 49.886987
697 West End Ave & W 94 St 50.576411
698 West St & Chambers St 49.244303
699 West Thames St 51.225434
700 William St & Pine St 61.079917
701 Willoughby Ave & Hall St 48.161013
702 Willoughby Ave & Tompkins Ave 47.354409
703 Willoughby Ave & Walworth St 60.038860
704 Willoughby St & Fleet St 55.478609
705 Wolcott St & Dwight St 51.499681
706 Wyckoff St & 3 Ave 47.633938
707 Wythe Ave & Metropolitan Ave 51.063932
708 Yankee Ferry Terminal 43.875722
709 York St & Jay St 51.127355

710 rows × 2 columns

Q('''
    select
        start_station_name,
        round((2020 - birth_year)/5)*5 as age,
        substr(starttime,11,2) as h,
        sum(case when gender = 1 then 1 else 0 end)as male,
        sum(case when gender = 2 then 2 else 0 end)as female
    from tripdata
    group by 1,2,3

''')
start_station_name age h male female
0 1 Ave & E 110 St NaN 0 0 0
1 1 Ave & E 110 St NaN 1 0 0
2 1 Ave & E 110 St NaN 2 0 0
3 1 Ave & E 110 St 25.0 0 6 2
4 1 Ave & E 110 St 25.0 1 6 6
5 1 Ave & E 110 St 25.0 2 6 0
6 1 Ave & E 110 St 30.0 0 2 6
7 1 Ave & E 110 St 30.0 1 10 40
8 1 Ave & E 110 St 30.0 2 4 2
9 1 Ave & E 110 St 35.0 0 22 6
10 1 Ave & E 110 St 35.0 1 24 28
11 1 Ave & E 110 St 35.0 2 3 0
12 1 Ave & E 110 St 40.0 0 28 0
13 1 Ave & E 110 St 40.0 1 14 36
14 1 Ave & E 110 St 40.0 2 8 2
15 1 Ave & E 110 St 45.0 0 13 6
16 1 Ave & E 110 St 45.0 1 17 8
17 1 Ave & E 110 St 45.0 2 3 0
18 1 Ave & E 110 St 50.0 0 2 0
19 1 Ave & E 110 St 50.0 1 2 0
20 1 Ave & E 110 St 50.0 2 0 2
21 1 Ave & E 110 St 55.0 0 1 32
22 1 Ave & E 110 St 55.0 1 9 16
23 1 Ave & E 110 St 60.0 1 6 2
24 1 Ave & E 110 St 65.0 0 0 4
25 1 Ave & E 110 St 65.0 1 5 0
26 1 Ave & E 110 St 65.0 2 1 0
27 1 Ave & E 110 St 75.0 0 0 2
28 1 Ave & E 110 St 75.0 1 1 0
29 1 Ave & E 16 St NaN 0 0 0
... ... ... ... ... ...
27659 York St & Jay St 30.0 1 1388 1592
27660 York St & Jay St 30.0 2 254 228
27661 York St & Jay St 35.0 0 389 322
27662 York St & Jay St 35.0 1 1755 1970
27663 York St & Jay St 35.0 2 244 244
27664 York St & Jay St 40.0 0 294 252
27665 York St & Jay St 40.0 1 1219 1190
27666 York St & Jay St 40.0 2 199 126
27667 York St & Jay St 45.0 0 220 192
27668 York St & Jay St 45.0 1 805 552
27669 York St & Jay St 45.0 2 119 72
27670 York St & Jay St 50.0 0 105 162
27671 York St & Jay St 50.0 1 846 342
27672 York St & Jay St 50.0 2 115 36
27673 York St & Jay St 55.0 0 94 60
27674 York St & Jay St 55.0 1 437 254
27675 York St & Jay St 55.0 2 66 56
27676 York St & Jay St 60.0 0 21 36
27677 York St & Jay St 60.0 1 176 262
27678 York St & Jay St 60.0 2 32 20
27679 York St & Jay St 65.0 0 32 22
27680 York St & Jay St 65.0 1 99 114
27681 York St & Jay St 65.0 2 11 10
27682 York St & Jay St 70.0 0 6 6
27683 York St & Jay St 70.0 1 56 46
27684 York St & Jay St 70.0 2 4 6
27685 York St & Jay St 75.0 0 3 0
27686 York St & Jay St 75.0 1 4 4
27687 York St & Jay St 80.0 1 4 0
27688 York St & Jay St 80.0 2 2 0

27689 rows × 5 columns

res = Q('select count(*) from tripdata')
res
count(*)
0 8685057

Creating a clean dataset

db.execute('''
CREATE TABLE
    tripdata_clean as
SELECT
    (2020- birth_year) as age,
    CASE WHEN gender = 0 then 'X'
         WHEN gender = 1 then 'Male'
         WHEN gender = 2 then 'Female' end as sex,

    *
FROM tripdata
WHERE age > 0
    AND age < 80
    AND tripduration <6000

''')
<sqlite3.Cursor at 0x29822a0af10>
Q('select count(*) from tripdata_clean')
count(*)
0 7630014

Plotting:

%pylab inline
Populating the interactive namespace from numpy and matplotlib
import seaborn as sb
res = Q('''

select age, count(*) as c from tripdata_clean group by age


''')
hist(res.age, weights=res.c, bins= len(res) )
(array([  3091.,  13052.,  22426.,  34788.,  43686.,  60885.,  82500.,
        144783., 224798., 275293., 303208., 328422., 333107., 335369.,
        321919., 319496., 314728., 289934., 272027., 253770., 234885.,
        213225., 195550., 178354., 166852., 164220., 151432., 153265.,
        140317., 137885., 145172., 153601., 142248., 126760., 116005.,
        109145., 112818., 107669., 102578., 100664.,  86778.,  87151.,
         76582.,  68865.,  63716.,  53088.,  44981.,  43182.,  38850.,
         26466.,  23917.,  17193.,  14196.,  12097.,  12677.,   9279.,
          5376.,   4472.,   3260.,   4688.,   3273.]),
 array([19.        , 19.98360656, 20.96721311, 21.95081967, 22.93442623,
        23.91803279, 24.90163934, 25.8852459 , 26.86885246, 27.85245902,
        28.83606557, 29.81967213, 30.80327869, 31.78688525, 32.7704918 ,
        33.75409836, 34.73770492, 35.72131148, 36.70491803, 37.68852459,
        38.67213115, 39.6557377 , 40.63934426, 41.62295082, 42.60655738,
        43.59016393, 44.57377049, 45.55737705, 46.54098361, 47.52459016,
        48.50819672, 49.49180328, 50.47540984, 51.45901639, 52.44262295,
        53.42622951, 54.40983607, 55.39344262, 56.37704918, 57.36065574,
        58.3442623 , 59.32786885, 60.31147541, 61.29508197, 62.27868852,
        63.26229508, 64.24590164, 65.2295082 , 66.21311475, 67.19672131,
        68.18032787, 69.16393443, 70.14754098, 71.13114754, 72.1147541 ,
        73.09836066, 74.08196721, 75.06557377, 76.04918033, 77.03278689,
        78.01639344, 79.        ]),
 <a list of 61 Patch objects>)

linearly separable data

res = Q('''
select
    age,
    sum(case when sex = 'Female' then 1 end) as 'Female',
    sum(case when sex = 'Male' then 1 end) as 'Male'
from
    tripdata_clean
group by 1


''')
res.head()
age Female Male
0 19.0 488 2520
1 20.0 2245 10693
2 21.0 5102 17286
3 22.0 7509 26574
4 23.0 11465 32083
figure(figsize = (5,12))
barh(res.age, res.Female/res.Female.sum(), color ='r', label = 'Female')
barh(res.age,-res.Male/res.Male.sum(), color='b', label = 'Male')
title('age distribution')
ylabel('age')
grid()
legend()
<matplotlib.legend.Legend at 0x29800abec48>

linearly separable data

res = Q('''
select
    age,
    sum(case when usertype = 'Subscriber' then 1 end ) as Subscriber,
    sum(case when usertype != 'Subscriber' then 1 end ) as Customer
from
    tripdata_clean
group by
    1

''')

res.head()
age Subscriber Customer
0 19.0 3089 2
1 20.0 13051 1
2 21.0 20178 2248
3 22.0 30122 4666
4 23.0 39579 4107
figure(figsize = (5,12))
barh(res.age, res.Subscriber/res.Subscriber.sum(), color ='y', label = 'Subscriber')
barh(res.age,-res.Customer/res.Customer.sum(), color='g', label = 'Customer')
title('age distribution')
ylabel('age')
grid()
legend()
<matplotlib.legend.Legend at 0x2980f0ca8c8>

linearly separable data

res = Q('''
select
    tripduration / 60 as 'duration',
    sum(case when sex = 'Female' then 1 end) as 'Female',
    sum(case when sex = 'Male' then 1 end) as 'Male'
from
    tripdata_clean
group by
    1
''')

res.head()
duration Female Male
0 1 17006 85222
1 2 45796 213363
2 3 78325 325656
3 4 104389 393120
4 5 120663 414420
figure(figsize = (5,12))
barh(res.duration, res.Female/res.Female.sum(), color ='r', label = 'Female')
barh(res.duration,-res.Male/res.Male.sum(), color='b', label = 'Male')
title('duration distribution')
ylabel('duration[in minutes]')
grid()
legend()
ylim(0,50)
(0, 50)

linearly separable data

res = Q('''
select
    start_station_name,
    start_station_latitude as lat,
    start_station_longitude as long,
    count(*) as c
from
    tripdata_clean
group by
    1,2,3

''')
res.head()
start_station_name lat long c
0 1 Ave & E 110 St 40.792327 -73.938300 291
1 1 Ave & E 16 St 40.732219 -73.981656 31789
2 1 Ave & E 18 St 40.733812 -73.980544 22571
3 1 Ave & E 30 St 40.741444 -73.975361 20550
4 1 Ave & E 44 St 40.750020 -73.969053 12099
hist(res.lat, range=(40.6, 40.9), bins= 100);

linearly separable data

hist(res.long, bins= 100, range = (-74.1,-73.8));

linearly separable data

plot(res.long, res.lat, '.')
xlim(-80,-70)
ylim(40,43)
(40, 43)

linearly separable data

plot(res.long, res.lat, '.')
xlim(-74.5,-73.5)
ylim(40.5,41)
(40.5, 41)

linearly separable data

plot(res.long, res.lat, '.')
xlim(-74.1,-73.8)
ylim(40.6,40.85)
(40.6, 40.85)

linearly separable data

figure(figsize =(10,10))
xstart=-74.1
ystart=40.6
extent=.25
plot(res.long, res.lat, '.')
xlim(xstart,xstart+extent)
ylim(ystart,ystart+extent)
(40.6, 40.85)

linearly separable data

figure(figsize =(10,10))
xstart=-74.05
ystart=40.645
extent=.18
plot(res.long, res.lat, 'o', ms=5)
xlim(xstart,xstart+extent)
ylim(ystart,ystart+extent)
(40.645, 40.825)

linearly separable data

figure(figsize =(10,10))
xstart=-74.05
ystart=40.645
extent=.18
scatter(res.long, res.lat, 35*res.c/res.c.max())
xlim(xstart,xstart+extent)
ylim(ystart,ystart+extent)
(40.645, 40.825)

linearly separable data

figure(figsize =(10,10))
xstart=-74.05
ystart=40.645
extent=.18
scatter(res.long, res.lat, c= 35*res.c/res.c.max(), cmap = 'rainbow')
xlim(xstart,xstart+extent)
ylim(ystart,ystart+extent)
colorbar()
<matplotlib.colorbar.Colorbar at 0x298307f1b88>

linearly separable data

def my_plot(x,y,s,c,max_size=50):
    figure(figsize =(10,10))
    xstart=-74.05
    ystart=40.645
    extent=.18
    scatter(x, y, s= max_size*s/max(s), c=c, cmap = 'rainbow')
    xlim(xstart,xstart+extent)
    ylim(ystart,ystart+extent)
    colorbar()
my_plot(res.long , res.lat,res.c, res.c)

linearly separable data

res = Q('''
select
    start_station_name,
    start_station_latitude as lat,
    start_station_longitude as long,
    avg(tripduration) as d,
    sum(case when sex = 'Female' then 1 end) as 'Female',
    sum(case when sex = 'Male' then 1 end) as 'Male'
from
    tripdata_clean
group by
    1,2,3

''')

res.head()
start_station_name lat long d Female Male
0 1 Ave & E 110 St 40.792327 -73.938300 868.518900 99.0 192
1 1 Ave & E 16 St 40.732219 -73.981656 665.284658 8934.0 22702
2 1 Ave & E 18 St 40.733812 -73.980544 678.556112 6063.0 16447
3 1 Ave & E 30 St 40.741444 -73.975361 734.345791 5146.0 15300
4 1 Ave & E 44 St 40.750020 -73.969053 857.510621 3412.0 8644
my_plot(res.long, res.lat, res.d,100*res.Male/(res.Male+res.Female))

linearly separable data

res = Q('''
select
    start_station_name,
    start_station_latitude as lat,
    start_station_longitude as long,
    avg(case when sex = 'Female' then age end) as 'Female',
    avg(case when sex = 'Male' then age end) as 'Male'
from
    tripdata_clean
group by
    1,2,3

''')

my_plot(res.long, res.lat, res.Female*0+1, res.Female)

linearly separable data