Hosted with nbsanity. See source notebook on GitHub.

Experiments with duckdb

14 04 24

Conclusions: duckdb is really fast

import duckdb as db
import pandas as pd
import polars as pl

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
%matplotlib inline

Datasets

  • GNAF
  • Uber
filename_gnaf = 'gnaf_feb2024/GNAF_CORE.parquet'
filename_uber = 'nyc_taxi/*.parquet'

GNAF: Queries

query = f"""
select STREET_NAME, count(STREET_NAME) freq 
from '{filename_gnaf}'
where STATE='NSW' group by STREET_NAME order by freq desc"""

db.sql(query).df()
STREET_NAME freq
0 PACIFIC 33668
1 VICTORIA 26291
2 GEORGE 24173
3 PRINCES 18595
4 CHURCH 17183
... ... ...
40765 MOLOWRAN 1
40766 MCGRUERS 1
40767 LODGE ROAD 1
40768 ROCKVILLE 1
40769 FAINTS 1

40770 rows × 2 columns

query = f"""
with nsw_addresses as (
    select STREET_NAME[1:1] start_letter
    from '{filename_gnaf}'
    where STATE='NSW' 
    )
select start_letter, count(start_letter) freq from nsw_addresses
group by start_letter order by freq desc"""

db.sql(query).df()
start_letter freq
0 B 500267
1 C 467428
2 M 426049
3 S 326787
4 P 299393
5 W 296203
6 H 258910
7 G 250268
8 A 220864
9 R 217310
10 T 213359
11 L 196466
12 D 181450
13 K 167153
14 F 161510
15 E 143570
16 N 121485
17 O 119007
18 J 86372
19 V 68137
20 I 37394
21 Y 36899
22 Q 26382
23 U 22352
24 Z 2993
25 X 123
26 4 21

Uber: Queries

query = f"""
select hour(tpep_pickup_datetime) from '{filename_uber}'
limit 5
"""
db.sql(query).df()
hour(tpep_pickup_datetime)
0 0
1 0
2 0
3 0
4 0
db.sql(f"select count(*) from '{filename_uber}' limit 5").df() 
count_star()
0 1397171103
query = f"""
with trip_times as (
    select dayname(tpep_pickup_datetime) pickup_day, hour(tpep_pickup_datetime) pickup_hour
    from '{filename_uber}'
)
select pickup_hour, count(pickup_hour) num_trips
from trip_times
group by pickup_hour
"""
db.sql(query).df()
pickup_hour num_trips
0 0 50678732
1 1 36877987
2 2 26957035
3 3 19555023
4 4 14525425
5 5 13591974
6 6 29403425
7 7 50361361
8 8 62541557
9 9 64666650
10 10 64080897
11 11 66721920
12 12 70508042
13 13 70561027
14 14 73439139
15 15 71081360
16 16 63335055
17 17 73969642
18 18 86322162
19 19 86792821
20 20 80625722
21 21 79247415
22 22 75871810
23 23 65454922
query = f"""
with trip_times as (
    select dayname(tpep_pickup_datetime) pickup_day, hour(tpep_pickup_datetime) pickup_hour
    from '{filename_uber}'
)
select pickup_day, pickup_hour, count(*) num_trips
from trip_times
group by pickup_day, pickup_hour
"""
results = db.sql(query).df()

plt.figure(figsize=(6, 4))
sns.lineplot(data=results, x='pickup_hour', y='num_trips', hue='pickup_day')
plt.show()