DuckDB 概览
curl https://install.duckdb.org | sh
pip install duckdb
install.packages("duckdb")
<dependency>
<groupId>org.duckdb</groupId>
<artifactId>duckdb_jdbc</artifactId>
<version>1.3.2.0</version>
</dependency>
npm install @duckdb/node-api
curl https://install.duckdb.org | sh
curl https://install.duckdb.org | sh
cargo add duckdb --features bundled
go get github.com/marcboeker/go-duckdb
-- Get the top-3 busiest train stations
SELECT
station_name,
count(*) AS num_services
FROM train_services
GROUP BY ALL
ORDER BY num_services DESC
LIMIT 3;
-- Load CSV file to a table. DuckDB auto-detects
-- the CSV's format, column name and types
CREATE TABLE stations AS
FROM 's3://duckdb-blobs/stations.csv';
-- Directly query Parquet file in S3
SELECT
station_name,
count(*) AS num_services
FROM 's3://duckdb-blobs/train_services.parquet'
GROUP BY ALL
ORDER BY num_services DESC
LIMIT 10;
-- Find the top-3 longest domestic train routes
SELECT s1.name_short, s2.name_short, d.distance
FROM distances d
JOIN stations s1 ON d.station1 = s1.code
JOIN stations s2 ON d.station2 = s2.code
WHERE s1.country = s2.country
AND s1.code < s2.code
ORDER BY distance DESC
LIMIT 3;
-- List the closest IC stations (as the crow flies)
SELECT
s1.name_long AS station1,
s2.name_long AS station2,
ST_Distance(
ST_Point(s1.geo_lng, s1.geo_lat),
ST_Point(s2.geo_lng, s2.geo_lat)
) * 111_139 AS distance
FROM stations s1, stations s2
WHERE s1.type LIKE '%Intercity%'
AND s2.type LIKE '%Intercity%'
AND s1.id < s2.id
ORDER BY distance ASC
LIMIT 3;
# Get the top-3 busiest train stations
import duckdb
duckdb.sql("""
SELECT station, count(*) AS num_services
FROM train_services
GROUP BY ALL
ORDER BY num_services DESC
LIMIT 3;
""")
# Reading and writing Pandas dataframes
import pandas as pd
import duckdb
df_in = pd.DataFrame({
'station': ['Delft', 'Delft', 'Gouda', 'Gouda'],
'day': ['Mon', 'Tue', 'Mon', 'Tue'],
'num_services' : [22, 20, 27, 25]})
# Run query on a dataframe and return a dataframe
df_out = duckdb.sql("""
SELECT station, sum(num_services)
FROM df_in
GROUP BY station
""").to_df()
# Create custom user-defined function
import duckdb
def plus_one(x):
return x + 1
con = duckdb.connect()
con.create_function('plus_one', plus_one,
['BIGINT'], 'BIGINT', type='native')
con.sql("""
SELECT sum(plus_one(i)) FROM range(10) tbl(i);
""")
# Find the largest sepals/petals in the Iris data set
library(duckdb)
con <- dbConnect(duckdb())
duckdb_register(con, "iris", iris)
query <- r'(
SELECT count(*) AS num_observations,
max("Sepal.Width") AS max_width,
max("Petal.Length") AS max_petal_length
FROM iris
WHERE "Sepal.Length" > 5
GROUP BY ALL
)'
dbGetQuery(con, query)
# Find the largest sepals/petals in the Iris data set
# using duckplyr
library("duckplyr")
iris |>
filter(Sepal.Length > 5) |>
group_by(Species) |>
summarize(
num_observations = n(),
max_width = max(Sepal.Width),
max_petal_length = max(Petal.Length),
na.rm = TRUE) |>
collect()
# Find the largest sepals/petals in the Iris data set
# using dplyr
library("duckdb")
library("dplyr")
con <- dbConnect(duckdb())
duckdb_register(con, "iris", iris)
tbl(con, "iris") |>
filter(Sepal.Length > 5) |>
group_by(Species) |>
summarize(
num_observations = count(),
max_width = max(Sepal.Width),
max_petal_length = max(Petal.Length),
na.rm = TRUE) |>
collect()
// Get a list of train stations by traffic
Connection conn =
DriverManager.getConnection("jdbc:duckdb:");
Statement st = conn.createStatement();
ResultSet rs = st.executeQuery(
"SELECT station_name,\n" +
" count(*) AS num_services\n" +
"FROM train_services\n" +
"GROUP BY ALL\n" +
"ORDER BY num_services DESC;");
System.out.println(rs.next());
// Perform bulk inserts using the Appender API
DuckDBConnection conn = (DuckDBConnection)
DriverManager.getConnection("jdbc:duckdb:");
Statement st = conn.createStatement();
st.execute("CREATE TABLE person " +
"(name VARCHAR, age INT)");
var appender = conn.createAppender(
DuckDBConnection.DEFAULT_SCHEMA, "person");
appender.beginRow();
appender.append("MC Ducky");
appender.append(49);
appender.endRow();
appender.close();
// Get the top-3 busiest train stations in May
import { DuckDBInstance } from '@duckdb/node-api';
const instance = await DuckDBInstance.create();
const connection = await instance.connect();
const reader = await connection.runAndReadAll(
`SELECT station_name, count(*) AS num_services
FROM
'http://blobs.duckdb.org/train_services.parquet'
WHERE monthname(date) = 'May'
GROUP BY ALL
ORDER BY num_services DESC
LIMIT 3;`
);
console.table(reader.getRows());
// Web Service Integration:
// Create endpoint to generate numbers
import express from "express";
import { DuckDBInstance } from '@duckdb/node-api';
const app = express();
const instance = await DuckDBInstance.create();
const connection = await instance.connect();
app.get("/getnumbers", async (req, res) => {
const reader = await connection.runAndReadAll(
"SELECT random() AS num FROM range(10)");
res.end(JSON.stringify(reader.getRows()));
});
app.listen(8082, () => console.log(
"Go to: https://:8082/getnumbers"));