From duckdb to st_to_sf

R
duckdb
arrow
sf
geoarrow
How to convert a duckdb extraction into an sf object
Author

Nicolas Chuche

Published

July 12, 2025

Until recently, generating an SF dataframe from a duckdb query required:

  1. Using ST_AsWKB or ST_AsText on the geometry column
  2. Materializing the data to transfer it to sf::st_as_sf

With recent versions of duckdb, the spatial extension, and the geoarrow package, you can now ask duckdb to produce data that can be directly reused by geoarrow:

library(geoarrow)
library(duckdb)
library(sf)

con <- dbConnect(duckdb())

url <- "https://static.data.gouv.fr/resources/sirene-geolocalise-parquet/20240107-143656/sirene2024-geo.parquet"

x <- dbExecute(con, "LOAD spatial;")
x <- dbExecute(con, "LOAD httpfs;")
1x <- dbExecute(con, "CALL register_geoarrow_extensions()")

dplyr::tbl(con, dplyr::sql(glue::glue("SELECT geometry 
                                       FROM read_parquet('{url}')
2                                       LIMIT 5"))) |>
3  arrow::to_arrow() |>
4  st_as_sf(crs=st_crs(2154))
1
Instructs duckdb spatial to add geoarrow metadata to geometry-type columns
2
Thanks to the previous command, this line will return geometries readable by geoarrow
3
This line converts the object into an arrow object
4
geoarrow overrides the st_as_sf function so it can directly read the arrow object
Simple feature collection with 5 features and 0 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: 3.735375 ymin: 49.38698 xmax: 3.738175 ymax: 49.39506
Projected CRS: RGF93 v1 / Lambert-93
                   geometry
1 POINT (3.738175 49.39245)
2 POINT (3.735375 49.38829)
3 POINT (3.735446 49.39507)
4 POINT (3.738132 49.38698)
5 POINT (3.735748 49.38712)

A quick comparison

And it’s much faster than all other methods:

Show me the benchmark code
library(arrow)
library(duckdb)
library(sf)
library(dplyr)
library(glue)
library(timemoir)
library(geoarrow)

sample_size <- 1e8

if (!file.exists("geo.parquet")) {
  download.file("https://static.data.gouv.fr/resources/sirene-geolocalise-parquet/20240107-143656/sirene2024-geo.parquet", "geo.parquet")
}

with_register_geoarrow <- function() {
  conn_ddb <- dbConnect(duckdb())
  dbExecute(conn_ddb, "LOAD spatial;")
  dbExecute(conn_ddb, "CALL register_geoarrow_extensions()")
  
  query <- dplyr::tbl(conn_ddb, sql(glue("SELECT * FROM read_parquet('geo.parquet') LIMIT {sample_size}"))) |>
    arrow::to_arrow() |>
    st_as_sf(crs=st_crs(2154))
  
  dbDisconnect(conn_ddb, shutdown = TRUE)
}

with_st_read <- function() {
  conn_ddb <- dbConnect(duckdb())
  on.exit(dbDisconnect(conn_ddb, shutdown = TRUE))
  dbExecute(conn_ddb, "LOAD spatial;")
  
  a <- st_read(
    conn_ddb, 
    query=glue(
      "SELECT * REPLACE(geometry.ST_ASWKB() AS geometry) FROM read_parquet('geo.parquet') 
      WHERE geometry IS NOT NULL LIMIT {sample_size}"
    ), 
    geometry_column = "geometry") |>
    st_set_crs(2154)
  dbDisconnect(conn_ddb, shutdown = TRUE)
}

with_get_query_aswkb <- function() {
  conn_ddb <- dbConnect(duckdb())
  on.exit(dbDisconnect(conn_ddb, shutdown = TRUE))
  dbExecute(conn_ddb, "LOAD spatial;")
  
  query <- dbGetQuery(
    conn_ddb, 
    glue(
      "
      SELECT * REPLACE(geometry.ST_AsWKB() AS geometry) FROM read_parquet('geo.parquet') 
      WHERE geometry IS NOT NULL LIMIT {sample_size}
      "
    )
  ) |>
    sf::st_as_sf(crs = st_crs(2154))
  dbDisconnect(conn_ddb, shutdown = TRUE)
}

with_get_query_astxt <- function() {
  conn_ddb <- dbConnect(duckdb())
  on.exit(dbDisconnect(conn_ddb, shutdown = TRUE))
  dbExecute(conn_ddb, "LOAD spatial;")
  
  query <- dbGetQuery(
    conn_ddb, 
    glue(
      "
      SELECT * REPLACE(geometry.ST_AsText() AS geometry) FROM read_parquet('geo.parquet')
      WHERE geometry IS NOT NULL LIMIT {sample_size}
      "
    )
  ) |>
    sf::st_as_sf(wkt = "geometry", crs = st_crs(2154))
}
res <- timemoir(
  with_register_geoarrow(), 
  with_st_read(),
  with_get_query_aswkb(),
  with_get_query_astxt())
res |>
  kableExtra::kable()
fname duration error start_mem max_mem cpu_user cpu_sys
with_register_geoarrow() 102.676 NA 256252 26895240 85.800 16.457
with_st_read() 552.807 NA 257016 25266148 496.485 54.894
with_get_query_aswkb() 593.295 NA 285444 25291172 552.240 77.263
with_get_query_astxt() 450.991 NA 286640 24853872 426.793 63.129
plot(res)

Back to top