Until recently, generating an SF dataframe from a duckdb query required:
Using ST_AsWKB
or ST_AsText
on the geometry column
Materializing the data to transfer it to sf::st_as_sf
With recent versions of duckdb, the spatial extension, and the geoarrow package, you can now ask duckdb to produce data that can be directly reused by geoarrow
:
library (geoarrow)
library (duckdb)
library (sf)
con <- dbConnect (duckdb ())
url <- "https://static.data.gouv.fr/resources/sirene-geolocalise-parquet/20240107-143656/sirene2024-geo.parquet"
x <- dbExecute (con, "LOAD spatial;" )
x <- dbExecute (con, "LOAD httpfs;" )
1 x <- dbExecute (con, "CALL register_geoarrow_extensions()" )
dplyr:: tbl (con, dplyr:: sql (glue:: glue ("SELECT geometry
FROM read_parquet('{url}')
2 LIMIT 5" ))) |>
3 arrow:: to_arrow () |>
4 st_as_sf (crs= st_crs (2154 ))
1
Instructs duckdb spatial to add geoarrow metadata to geometry-type columns
2
Thanks to the previous command, this line will return geometries readable by geoarrow
3
This line converts the object into an arrow object
4
geoarrow overrides the st_as_sf
function so it can directly read the arrow object
Simple feature collection with 5 features and 0 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 3.735375 ymin: 49.38698 xmax: 3.738175 ymax: 49.39506
Projected CRS: RGF93 v1 / Lambert-93
geometry
1 POINT (3.738175 49.39245)
2 POINT (3.735375 49.38829)
3 POINT (3.735446 49.39507)
4 POINT (3.738132 49.38698)
5 POINT (3.735748 49.38712)
A quick comparison
And it’s much faster than all other methods:
Show me the benchmark code library (arrow)
library (duckdb)
library (sf)
library (dplyr)
library (glue)
library (timemoir)
library (geoarrow)
sample_size <- 1e8
if (! file.exists ("geo.parquet" )) {
download.file ("https://static.data.gouv.fr/resources/sirene-geolocalise-parquet/20240107-143656/sirene2024-geo.parquet" , "geo.parquet" )
}
with_register_geoarrow <- function () {
conn_ddb <- dbConnect (duckdb ())
dbExecute (conn_ddb, "LOAD spatial;" )
dbExecute (conn_ddb, "CALL register_geoarrow_extensions()" )
query <- dplyr:: tbl (conn_ddb, sql (glue ("SELECT * FROM read_parquet('geo.parquet') LIMIT {sample_size}" ))) |>
arrow:: to_arrow () |>
st_as_sf (crs= st_crs (2154 ))
dbDisconnect (conn_ddb, shutdown = TRUE )
}
with_st_read <- function () {
conn_ddb <- dbConnect (duckdb ())
on.exit (dbDisconnect (conn_ddb, shutdown = TRUE ))
dbExecute (conn_ddb, "LOAD spatial;" )
a <- st_read (
conn_ddb,
query= glue (
"SELECT * REPLACE(geometry.ST_ASWKB() AS geometry) FROM read_parquet('geo.parquet')
WHERE geometry IS NOT NULL LIMIT {sample_size}"
),
geometry_column = "geometry" ) |>
st_set_crs (2154 )
dbDisconnect (conn_ddb, shutdown = TRUE )
}
with_get_query_aswkb <- function () {
conn_ddb <- dbConnect (duckdb ())
on.exit (dbDisconnect (conn_ddb, shutdown = TRUE ))
dbExecute (conn_ddb, "LOAD spatial;" )
query <- dbGetQuery (
conn_ddb,
glue (
"
SELECT * REPLACE(geometry.ST_AsWKB() AS geometry) FROM read_parquet('geo.parquet')
WHERE geometry IS NOT NULL LIMIT {sample_size}
"
)
) |>
sf:: st_as_sf (crs = st_crs (2154 ))
dbDisconnect (conn_ddb, shutdown = TRUE )
}
with_get_query_astxt <- function () {
conn_ddb <- dbConnect (duckdb ())
on.exit (dbDisconnect (conn_ddb, shutdown = TRUE ))
dbExecute (conn_ddb, "LOAD spatial;" )
query <- dbGetQuery (
conn_ddb,
glue (
"
SELECT * REPLACE(geometry.ST_AsText() AS geometry) FROM read_parquet('geo.parquet')
WHERE geometry IS NOT NULL LIMIT {sample_size}
"
)
) |>
sf:: st_as_sf (wkt = "geometry" , crs = st_crs (2154 ))
}
res <- timemoir (
with_register_geoarrow (),
with_st_read (),
with_get_query_aswkb (),
with_get_query_astxt ())
res |>
kableExtra:: kable ()
with_register_geoarrow()
102.676
NA
256252
26895240
85.800
16.457
with_st_read()
552.807
NA
257016
25266148
496.485
54.894
with_get_query_aswkb()
593.295
NA
285444
25291172
552.240
77.263
with_get_query_astxt()
450.991
NA
286640
24853872
426.793
63.129
Some useful links
There isn’t much documentation about this command:
devtools:: session_info (pkgs = "attached" )
─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.5.0 (2025-04-11)
os Ubuntu 22.04.5 LTS
system x86_64, linux-gnu
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz Etc/UTC
date 2025-08-09
pandoc 3.7.0.2 @ /usr/bin/ (via rmarkdown)
quarto 1.7.31 @ /usr/local/bin/quarto
─ Packages ───────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
arrow * 20.0.0.2 2025-05-26 [1] RSPM (R 4.5.0)
DBI * 1.2.3 2024-06-02 [1] RSPM (R 4.5.0)
dplyr * 1.1.4 2023-11-17 [1] RSPM (R 4.5.0)
duckdb * 1.3.0 2025-06-02 [1] RSPM (R 4.5.0)
geoarrow * 0.3.0 2025-05-26 [1] RSPM (R 4.5.0)
glue * 1.8.0 2024-09-30 [1] RSPM (R 4.5.0)
sf * 1.0-21 2025-05-15 [1] RSPM (R 4.5.0)
timemoir * 0.8.0.9000 2025-08-09 [1] Github (nbc/timemoir@646734a)
[1] /usr/local/lib/R/site-library
[2] /usr/local/lib/R/library
* ── Packages attached to the search path.
──────────────────────────────────────────────────────────────────────────────
Back to top