import duckdb as dbimport pandas as pdimport polars as plimport numpy as npimport matplotlib.pyplot as pltimport seaborn as snssns.set()%matplotlib inline
query =f"""select STREET_NAME, count(STREET_NAME) freq from '{filename_gnaf}'where STATE='NSW' group by STREET_NAME order by freq desc"""db.sql(query).df()
STREET_NAME
freq
0
PACIFIC
33668
1
VICTORIA
26291
2
GEORGE
24173
3
PRINCES
18595
4
CHURCH
17183
...
...
...
40765
MOLOWRAN
1
40766
MCGRUERS
1
40767
LODGE ROAD
1
40768
ROCKVILLE
1
40769
FAINTS
1
40770 rows × 2 columns
query =f"""with nsw_addresses as ( select STREET_NAME[1:1] start_letter from '{filename_gnaf}' where STATE='NSW' )select start_letter, count(start_letter) freq from nsw_addressesgroup by start_letter order by freq desc"""db.sql(query).df()
start_letter
freq
0
B
500267
1
C
467428
2
M
426049
3
S
326787
4
P
299393
5
W
296203
6
H
258910
7
G
250268
8
A
220864
9
R
217310
10
T
213359
11
L
196466
12
D
181450
13
K
167153
14
F
161510
15
E
143570
16
N
121485
17
O
119007
18
J
86372
19
V
68137
20
I
37394
21
Y
36899
22
Q
26382
23
U
22352
24
Z
2993
25
X
123
26
4
21
Uber: Queries
query =f"""select hour(tpep_pickup_datetime) from '{filename_uber}'limit 5"""db.sql(query).df()
hour(tpep_pickup_datetime)
0
0
1
0
2
0
3
0
4
0
db.sql(f"select count(*) from '{filename_uber}' limit 5").df()
count_star()
0
1397171103
query =f"""with trip_times as ( select dayname(tpep_pickup_datetime) pickup_day, hour(tpep_pickup_datetime) pickup_hour from '{filename_uber}')select pickup_hour, count(pickup_hour) num_tripsfrom trip_timesgroup by pickup_hour"""db.sql(query).df()
pickup_hour
num_trips
0
0
50678732
1
1
36877987
2
2
26957035
3
3
19555023
4
4
14525425
5
5
13591974
6
6
29403425
7
7
50361361
8
8
62541557
9
9
64666650
10
10
64080897
11
11
66721920
12
12
70508042
13
13
70561027
14
14
73439139
15
15
71081360
16
16
63335055
17
17
73969642
18
18
86322162
19
19
86792821
20
20
80625722
21
21
79247415
22
22
75871810
23
23
65454922
query =f"""with trip_times as ( select dayname(tpep_pickup_datetime) pickup_day, hour(tpep_pickup_datetime) pickup_hour from '{filename_uber}')select pickup_day, pickup_hour, count(*) num_tripsfrom trip_timesgroup by pickup_day, pickup_hour"""results = db.sql(query).df()plt.figure(figsize=(6, 4))sns.lineplot(data=results, x='pickup_hour', y='num_trips', hue='pickup_day')plt.show()