From bfffb4e9e46786cba2a30a487b65d5b19c0c5e7a Mon Sep 17 00:00:00 2001 From: Svilen Mihaylov <172523102+svilen-mihaylov-db@users.noreply.github.com> Date: Sun, 19 Apr 2026 18:56:04 -0400 Subject: [PATCH 1/2] Add NostromoDB to JSONbench --- nostromodb/benchmark.sh | 30 ++++++++++++ nostromodb/count.sh | 14 ++++++ nostromodb/create_and_load.sh | 21 ++++++++ nostromodb/drop_table.sh | 14 ++++++ nostromodb/install.sh | 5 ++ nostromodb/load_data.sh | 40 ++++++++++++++++ nostromodb/main.sh | 77 ++++++++++++++++++++++++++++++ nostromodb/physical_query_plans.sh | 25 ++++++++++ nostromodb/queries.sql | 5 ++ nostromodb/queries_formatted.sql | 64 +++++++++++++++++++++++++ nostromodb/query_results.sh | 25 ++++++++++ nostromodb/run_queries.sh | 26 ++++++++++ nostromodb/run_statements.sh | 15 ++++++ nostromodb/total_size.sh | 12 +++++ nostromodb/uninstall.sh | 6 +++ 15 files changed, 379 insertions(+) create mode 100755 nostromodb/benchmark.sh create mode 100755 nostromodb/count.sh create mode 100755 nostromodb/create_and_load.sh create mode 100755 nostromodb/drop_table.sh create mode 100755 nostromodb/install.sh create mode 100755 nostromodb/load_data.sh create mode 100755 nostromodb/main.sh create mode 100755 nostromodb/physical_query_plans.sh create mode 100644 nostromodb/queries.sql create mode 100644 nostromodb/queries_formatted.sql create mode 100755 nostromodb/query_results.sh create mode 100755 nostromodb/run_queries.sh create mode 100755 nostromodb/run_statements.sh create mode 100755 nostromodb/total_size.sh create mode 100755 nostromodb/uninstall.sh diff --git a/nostromodb/benchmark.sh b/nostromodb/benchmark.sh new file mode 100755 index 0000000..2191129 --- /dev/null +++ b/nostromodb/benchmark.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 2 ]]; then + echo "Usage: $0 <DATA_DIR> <DATABASE_PATH> [RESULT_FILE]" + exit 1 +fi + +# Arguments +DATA_DIR="$1" +DATABASE_PATH="$2" +RESULT_FILE="${3:-}" + +# Print the database name +echo "Running queries 
on database: $DATABASE_PATH" + +# Run queries and log the output +./run_queries.sh "$DATA_DIR" "$DATABASE_PATH" 2>&1 | tee query_log.txt + +# Process the query log and prepare the result +RESULT=$(cat query_log.txt | grep -oP 'Real time: \d+\.\d+ seconds' | sed -r -e 's/Real time: ([0-9]+\.[0-9]+) seconds/\1/' | \ +awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') + +# Output the result +if [[ -n "$RESULT_FILE" ]]; then + echo "$RESULT" > "$RESULT_FILE" + echo "Result written to $RESULT_FILE" +else + echo "$RESULT" +fi diff --git a/nostromodb/count.sh b/nostromodb/count.sh new file mode 100755 index 0000000..92d3587 --- /dev/null +++ b/nostromodb/count.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 3 ]]; then + echo "Usage: $0 <DATA_PATH> <DATABASE_PATH> <TABLE_NAME>" + exit 1 +fi + +# Arguments +DATA_PATH="$1" +DATABASE_PATH="$2" +TABLE_NAME="$3" + +./run_statements.sh "$DATA_PATH" "$DATABASE_PATH" "select count(*) from $TABLE_NAME;" diff --git a/nostromodb/create_and_load.sh b/nostromodb/create_and_load.sh new file mode 100755 index 0000000..d48e948 --- /dev/null +++ b/nostromodb/create_and_load.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 4 ]]; then + echo "Usage: $0 <DB_PATH> <TABLE_NAME> <DATA_DIRECTORY> <NUM_FILES>" + exit 1 +fi + +# Arguments +DB_PATH="$1" +TABLE_NAME="$2" +DATA_DIRECTORY="$3" +NUM_FILES="$4" + +# Validate arguments +[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error in create_and_load: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } +[[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error in create_and_load: NUM_FILES must be a positive integer."; exit 1; } + +# No need to issue explicit 'create table', just load the data... 
+echo "Load data" +./load_data.sh "$DATA_DIRECTORY" "$DB_PATH" "$TABLE_NAME" "$NUM_FILES" diff --git a/nostromodb/drop_table.sh b/nostromodb/drop_table.sh new file mode 100755 index 0000000..47ddb08 --- /dev/null +++ b/nostromodb/drop_table.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 1 ]]; then + echo "Usage: $0 <DATABASE_PATH>" + exit 1 +fi + +# Arguments +DATABASE_PATH="$1" + +echo "Dropping database: $DATABASE_PATH" + +rm -rf "${DATABASE_PATH}" diff --git a/nostromodb/install.sh b/nostromodb/install.sh new file mode 100755 index 0000000..e72354c --- /dev/null +++ b/nostromodb/install.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +sudo snap install docker +sudo apt-get update +docker pull svilenmihaylov/nostromodb:latest diff --git a/nostromodb/load_data.sh b/nostromodb/load_data.sh new file mode 100755 index 0000000..0d4d598 --- /dev/null +++ b/nostromodb/load_data.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 4 ]]; then + echo "Usage: $0 <DATA_DIR> <DB_PATH> <TABLE_NAME> <MAX_FILES>" + exit 1 +fi + +# Arguments +DATA_DIR="$1" +DB_PATH="$2" +TABLE_NAME="$3" +MAX_FILES="$4" + +# Validate that MAX_FILES is a number +if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then + echo "Error: <MAX_FILES> must be a positive integer." 
+ exit 1 +fi + +counter=0 + +# Loop through each .json.gz file in the directory +for file in $(ls "$DATA_DIR"/*.json.gz | sort); do + echo "Progress:" $counter "of" $MAX_FILES "files loaded" + + if [[ -f "$file" ]]; then + base_name=$(basename ${file}) + ./run_statements.sh "$DATA_DIR" "$DB_PATH" "import from '/data/$base_name' into $TABLE_NAME options {'has_top_level_array': false}" + counter=$((counter + 1)) + fi + + # Stop processing if the max number of files is reached + if [[ $counter -ge $MAX_FILES ]]; then + echo "Copied maximum number of files: $MAX_FILES" + break + fi +done + + ./run_statements.sh "$DATA_DIR" "$DB_PATH" "pragma table_flush('$TABLE_NAME')" diff --git a/nostromodb/main.sh b/nostromodb/main.sh new file mode 100755 index 0000000..1c15435 --- /dev/null +++ b/nostromodb/main.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +DEFAULT_CHOICE=ask +DEFAULT_DATA_DIRECTORY=~/data/bluesky + +# Allow the user to optionally provide the scale factor ("choice") as an argument +CHOICE="${1:-$DEFAULT_CHOICE}" + +# Allow the user to optionally provide the data directory as an argument +DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" + +# Define prefix for output files +OUTPUT_PREFIX="${5:-_m6i.8xlarge}" + +# Check if the directory exists +if [[ ! -d "$DATA_DIRECTORY" ]]; then + echo "Error in main: Data directory '$DATA_DIRECTORY' does not exist." + exit 1 +fi + +if [ "$CHOICE" = "ask" ]; then + echo "Select the dataset size to benchmark:" + echo "1) 1m (default)" + echo "2) 10m" + echo "3) 100m" + echo "4) 1000m" + echo "5) all" + read -p "Enter the number corresponding to your choice: " CHOICE +fi + +./install.sh + +benchmark() { + local size=$1 + # Check DATA_DIRECTORY contains the required number of files to run the benchmark + file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) + if (( file_count < size )); then + echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 
+ exit 1 + fi + + local DB_DIR=~/data/"nostromodb_jsonbench_${size}" + rm -rf "${DB_DIR}" + mkdir -p "${DB_DIR}" + + ./create_and_load.sh "$DB_DIR" bluesky "$DATA_DIRECTORY" "$size" + ./total_size.sh "$DB_DIR" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size" + ./count.sh "$DATA_DIRECTORY" "$DB_DIR" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" + ./query_results.sh "$DATA_DIRECTORY" "$DB_DIR" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results" + ./physical_query_plans.sh "$DATA_DIRECTORY" "$DB_DIR" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.physical_query_plans" + ./benchmark.sh "$DATA_DIRECTORY" "$DB_DIR" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" + ./drop_table.sh "$DB_DIR" +} + +case $CHOICE in + 2) + benchmark 10 + ;; + 3) + benchmark 100 + ;; + 4) + benchmark 1000 + ;; + 5) + benchmark 1 + benchmark 10 + benchmark 100 + benchmark 1000 + ;; + *) + benchmark 1 + ;; +esac + + +./uninstall.sh diff --git a/nostromodb/physical_query_plans.sh b/nostromodb/physical_query_plans.sh new file mode 100755 index 0000000..d90f14c --- /dev/null +++ b/nostromodb/physical_query_plans.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 2 ]]; then + echo "Usage: $0 <DATA_DIR> <DATABASE_PATH>" + exit 1 +fi + +# Arguments +DATA_DIR="$1" +DATABASE_PATH="$2" + +QUERY_NUM=1 + +cat queries.sql | while read -r query; do + # Print the query number + echo "------------------------------------------------------------------------------------------------------------------------" + echo "Physical query plan for query Q$QUERY_NUM:" + echo + + ./run_statements.sh "$DATA_DIR" "$DATABASE_PATH" "EXPLAIN $query" + + # Increment the query number + QUERY_NUM=$((QUERY_NUM + 1)) +done; diff --git a/nostromodb/queries.sql b/nostromodb/queries.sql new file mode 100644 index 0000000..c1dcee5 --- /dev/null +++ b/nostromodb/queries.sql @@ -0,0 +1,5 @@ +select commit.collection as event, count(*) as count from bluesky group by event order by count desc, event asc; 
+select commit.collection as event, count(*) as count, count(distinct did) as users from bluesky where kind == 'commit' and commit.operation = 'create' group by event order by count desc; +select commit.collection as event, extract(hour from epoch_time_to_timestamp(time_us/1000000)) as hour_of_day, count(*) as count from bluesky where kind = 'commit' AND commit.operation = 'create' and commit.collection IN ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'] group by event, hour_of_day order by hour_of_day, event; +select did as user_id, min(epoch_time_to_timestamp(time_us/1000000)) as first_post_ts from bluesky where kind = 'commit' and commit.operation = 'create' and commit.collection = 'app.bsky.feed.post' group by user_id order by first_post_ts asc limit 3; +select did as user_id, 1000*extract(epoch from date_diff(coerce_to_str(max(epoch_time_to_timestamp(time_us/1000000))), coerce_to_str(min(epoch_time_to_timestamp(time_us/1000000))))) as activity_span from bluesky where kind = 'commit' and commit.operation = 'create' and commit.collection = 'app.bsky.feed.post' group by user_id order by activity_span desc limit 3; diff --git a/nostromodb/queries_formatted.sql b/nostromodb/queries_formatted.sql new file mode 100644 index 0000000..4183868 --- /dev/null +++ b/nostromodb/queries_formatted.sql @@ -0,0 +1,64 @@ +------------------------------------------------------------------------------------------------------------------------ +-- Q1 - Top event types +------------------------------------------------------------------------------------------------------------------------ + +select + commit.collection as event, + count(*) as count +from bluesky +group by event +order by count desc; + +------------------------------------------------------------------------------------------------------------------------ +-- Q2 - Top event types together with unique users per event type 
+------------------------------------------------------------------------------------------------------------------------ + +select + commit.collection as event, + count(*) as count, + count(distinct did) as users +from bluesky +where kind == 'commit' and commit.operation = 'create' +group by event +order by count desc; + +------------------------------------------------------------------------------------------------------------------------ +-- Q3 - When do people use BlueSky +------------------------------------------------------------------------------------------------------------------------ + +select + commit.collection as event, + extract(hour from epoch_time_to_timestamp(time_us/1000000)) as hour_of_day, + count(*) as count +from bluesky +where kind = 'commit' AND commit.operation = 'create' and commit.collection IN ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'] +group by event, hour_of_day +order by hour_of_day, event; + +------------------------------------------------------------------------------------------------------------------------ +-- Q4 - top 3 post veterans +------------------------------------------------------------------------------------------------------------------------ + +select + did as user_id, + min(epoch_time_to_timestamp(time_us/1000000)) as first_post_ts +from bluesky +where kind = 'commit' and commit.operation = 'create' and commit.collection = 'app.bsky.feed.post' +group by user_id +order by first_post_ts asc +limit 3; + +------------------------------------------------------------------------------------------------------------------------ +-- Q5 - top 3 users with longest activity +------------------------------------------------------------------------------------------------------------------------ + +select + did as user_id, + 1000*extract(epoch from date_diff( + coerce_to_str(max(epoch_time_to_timestamp(time_us/1000000))), + coerce_to_str(min(epoch_time_to_timestamp(time_us/1000000))))) as 
activity_span +from bluesky +where kind = 'commit' and commit.operation = 'create' and commit.collection = 'app.bsky.feed.post' +group by user_id +order by activity_span desc +limit 3; diff --git a/nostromodb/query_results.sh b/nostromodb/query_results.sh new file mode 100755 index 0000000..9c2641e --- /dev/null +++ b/nostromodb/query_results.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 2 ]]; then + echo "Usage: $0 <DATA_DIR> <DATABASE_PATH>" + exit 1 +fi + +# Arguments +DATA_DIR="$1" +DATABASE_PATH="$2" + +QUERY_NUM=1 + +cat queries.sql | while read -r query; do + # Print the query + echo "------------------------------------------------------------------------------------------------------------------------" + echo "Result for query Q$QUERY_NUM:" + echo + + ./run_statements.sh "$DATA_DIR" "$DATABASE_PATH" "$query" + + # Increment the query number + QUERY_NUM=$((QUERY_NUM + 1)) +done; diff --git a/nostromodb/run_queries.sh b/nostromodb/run_queries.sh new file mode 100755 index 0000000..04aa9cc --- /dev/null +++ b/nostromodb/run_queries.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 2 ]]; then + echo "Usage: $0 <DATA_DIR> <DB_PATH>" + exit 1 +fi + +# Arguments +DATA_DIR="$1" +DB_PATH="$2" + +TRIES=3 + +cat queries.sql | while read -r query; do + # Clear filesystem cache between queries. + sync + echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null + + echo "Running query: $query" + for i in $(seq 1 $TRIES); do + # Run query with profiling enabled and extract the real time. 
+ REAL_TIME=$(./run_statements.sh "$DATA_DIR" "$DB_PATH" "profile $query" | grep -i "optimization took" | awk '{print substr($3, 1, length($3)-3)}') + echo "Real time: $REAL_TIME seconds" + done +done diff --git a/nostromodb/run_statements.sh b/nostromodb/run_statements.sh new file mode 100755 index 0000000..67cc310 --- /dev/null +++ b/nostromodb/run_statements.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 3 ]]; then + echo "Usage: $0 <DATA_PATH> <DATABASE_PATH> <QUERY_TEXT>" + exit 1 +fi + +# Arguments +DATA_PATH="$1" +DATABASE_PATH="$2" +QUERY_TEXT="$3" + +#echo "Running statement:" $QUERY_TEXT $DATA_PATH $DATABASE_PATH +docker run --ulimit nofile=65535:65535 -v "$DATA_PATH":/data -v "$DATABASE_PATH":/db svilenmihaylov/nostromodb --db_path /db/catalog.json --c "$QUERY_TEXT" diff --git a/nostromodb/total_size.sh b/nostromodb/total_size.sh new file mode 100755 index 0000000..0757cd6 --- /dev/null +++ b/nostromodb/total_size.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 1 ]]; then + echo "Usage: $0 <DATABASE_NAME>" + exit 1 +fi + +# Arguments +DATABASE_NAME="$1" + +du "$DATABASE_NAME" | awk '{print $1 * 1024}' diff --git a/nostromodb/uninstall.sh b/nostromodb/uninstall.sh new file mode 100755 index 0000000..63dfe8c --- /dev/null +++ b/nostromodb/uninstall.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +# stop and remove all docker containers +docker stop $(docker ps -a -q) +docker rm $(docker ps -a -q) +sudo snap remove --purge docker From 047b80b157ee98f695d04707963ec2a5b9692fb7 Mon Sep 17 00:00:00 2001 From: Svilen Mihaylov <172523102+svilen-mihaylov-db@users.noreply.github.com> Date: Sun, 19 Apr 2026 19:18:47 -0400 Subject: [PATCH 2/2] Populate results/ with output for 1m dataset. 
--- nostromodb/query_log.txt | 20 ++++ .../results/_m6i.8xlarge_bluesky_1m.count | 7 ++ .../results/_m6i.8xlarge_bluesky_1m.data_size | 1 + ...6i.8xlarge_bluesky_1m.physical_query_plans | 95 +++++++++++++++++++ .../_m6i.8xlarge_bluesky_1m.query_results | 82 ++++++++++++++++ .../_m6i.8xlarge_bluesky_1m.results_runtime | 5 + 6 files changed, 210 insertions(+) create mode 100644 nostromodb/query_log.txt create mode 100644 nostromodb/results/_m6i.8xlarge_bluesky_1m.count create mode 100644 nostromodb/results/_m6i.8xlarge_bluesky_1m.data_size create mode 100644 nostromodb/results/_m6i.8xlarge_bluesky_1m.physical_query_plans create mode 100644 nostromodb/results/_m6i.8xlarge_bluesky_1m.query_results create mode 100644 nostromodb/results/_m6i.8xlarge_bluesky_1m.results_runtime diff --git a/nostromodb/query_log.txt b/nostromodb/query_log.txt new file mode 100644 index 0000000..f4bd280 --- /dev/null +++ b/nostromodb/query_log.txt @@ -0,0 +1,20 @@ +Running query: select commit.collection as event, count(*) as count from bluesky group by event order by count desc, event asc; +Real time: 0.02692273 seconds +Real time: 0.01694564 seconds +Real time: 0.01638707 seconds +Running query: select commit.collection as event, count(*) as count, count(distinct did) as users from bluesky where kind == 'commit' and commit.operation = 'create' group by event order by count desc; +Real time: 0.09046148 seconds +Real time: 0.09066723 seconds +Real time: 0.06633239 seconds +Running query: select commit.collection as event, extract(hour from epoch_time_to_timestamp(time_us/1000000)) as hour_of_day, count(*) as count from bluesky where kind = 'commit' AND commit.operation = 'create' and commit.collection IN ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'] group by event, hour_of_day order by hour_of_day, event; +Real time: 0.07178712 seconds +Real time: 0.05345615 seconds +Real time: 0.07188973 seconds +Running query: select did as user_id, 
min(epoch_time_to_timestamp(time_us/1000000)) as first_post_ts from bluesky where kind = 'commit' and commit.operation = 'create' and commit.collection = 'app.bsky.feed.post' group by user_id order by first_post_ts asc limit 3; +Real time: 0.08329340 seconds +Real time: 0.05289203 seconds +Real time: 0.05629961 seconds +Running query: select did as user_id, 1000*extract(epoch from date_diff(coerce_to_str(max(epoch_time_to_timestamp(time_us/1000000))), coerce_to_str(min(epoch_time_to_timestamp(time_us/1000000))))) as activity_span from bluesky where kind = 'commit' and commit.operation = 'create' and commit.collection = 'app.bsky.feed.post' group by user_id order by activity_span desc limit 3; +Real time: 0.08881190 seconds +Real time: 0.06515721 seconds +Real time: 0.06154216 seconds diff --git a/nostromodb/results/_m6i.8xlarge_bluesky_1m.count b/nostromodb/results/_m6i.8xlarge_bluesky_1m.count new file mode 100644 index 0000000..c0a1436 --- /dev/null +++ b/nostromodb/results/_m6i.8xlarge_bluesky_1m.count @@ -0,0 +1,7 @@ +Opened database at '/db/catalog.json' +╭───────────────╮ +│ count(*): int │ +├───────────────┤ +│ 1000000 │ +╰───────────────╯ +1 value(s) returned. 
diff --git a/nostromodb/results/_m6i.8xlarge_bluesky_1m.data_size b/nostromodb/results/_m6i.8xlarge_bluesky_1m.data_size new file mode 100644 index 0000000..71a6d39 --- /dev/null +++ b/nostromodb/results/_m6i.8xlarge_bluesky_1m.data_size @@ -0,0 +1 @@ +209420288 diff --git a/nostromodb/results/_m6i.8xlarge_bluesky_1m.physical_query_plans b/nostromodb/results/_m6i.8xlarge_bluesky_1m.physical_query_plans new file mode 100644 index 0000000..dcc6967 --- /dev/null +++ b/nostromodb/results/_m6i.8xlarge_bluesky_1m.physical_query_plans @@ -0,0 +1,95 @@ +------------------------------------------------------------------------------------------------------------------------ +Physical query plan for query Q1: + +Opened database at '/db/catalog.json' +╭───────────────────────────────────────────────────────────────────╮ +│ Stage: str │ +├───────────────────────────────────────────────────────────────────┤ +│ Result[$14, $15] │ +│ Eval[$15=coerce_to_json($13)] │ +│ Eval[$14=coerce_to_json($12)] │ +│ Sort[Desc($6), Asc($1), $1->$12, $6->$13] │ +│ HashGroup[$1=$10 | $6=Sum($11)] │ +│ MergeSegments[] │ +│ HashGroup[$10=$7 | $11=Sum(_internal_limiter(1, $7))] │ +│ Scan['BLUESKY', 'commit.collection': $7=$$] │ +╰───────────────────────────────────────────────────────────────────╯ +8 value(s) returned. 
+------------------------------------------------------------------------------------------------------------------------ +Physical query plan for query Q2: + +Opened database at '/db/catalog.json' +╭───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ Stage: str │ +├───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ Result[$21, $22, $23] │ +│ Eval[$23=coerce_to_json($20)] │ +│ Eval[$22=coerce_to_json($19)] │ +│ Eval[$21=coerce_to_json($18)] │ +│ Sort[Desc($8), $3->$18, $8->$19, $11->$20] │ +│ HashGroup[$3=$15 | $8=Sum($16), $11=CountDistinctArrEl($17)] │ +│ MergeSegments[] │ +│ HashGroup[$15=$12 | $16=Sum(_internal_limiter(1, $12)), $17=Set($14)] │ +│ Scan['BLUESKY', 'commit.collection': $12=$$, 'commit.operation': ="create", 'did': $14=$$, 'kind': ="commit"] │ +╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +9 value(s) returned. 
+------------------------------------------------------------------------------------------------------------------------ +Physical query plan for query Q3: + +Opened database at '/db/catalog.json' +╭──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ Stage: str │ +├──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ Result[$24, $25, $26] │ +│ Eval[$26=coerce_to_json($23)] │ +│ Eval[$25=coerce_to_json($22)] │ +│ Eval[$24=coerce_to_json($21)] │ +│ Sort[Asc($5), Asc($4), $4->$21, $5->$22, $12->$23] │ +│ HashGroup[$4=$18, $5=$19 | $12=Sum($20)] │ +│ MergeSegments[] │ +│ HashGroup[$18=$13, $19=$14 | $20=Sum(_internal_limiter(1, $13))] │ +│ Eval[$14=epoch_time_to_hours($17)] │ +│ Scan['BLUESKY', 'commit.collection': ((="app.bsky.feed.post" Or ="app.bsky.feed.repost") Or ="app.bsky.feed.like"), $13=$$, 'commit.operation': ="create", 'kind': ="commit", 'time_us': $17=(coerce_to_double($$)/1000000.0)] │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +10 value(s) returned. 
+------------------------------------------------------------------------------------------------------------------------ +Physical query plan for query Q4: + +Opened database at '/db/catalog.json' +╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ Stage: str │ +├─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ Result[$20, $21] │ +│ Eval[$21=coerce_to_json($19)] │ +│ Eval[$20=coerce_to_json($18)] │ +│ LimitSkip[3, 0, $9->$19, $16->$18] │ +│ Eval[$9=epoch_time_to_timestamp($17)] │ +│ Sort[Asc($12), $4->$16, $12->$17] │ +│ HashGroup[$4=$14 | $12=Min($15)] │ +│ MergeSegments[] │ +│ HashGroup[$14=$10 | $15=Min($13)] │ +│ Scan['BLUESKY', 'commit.collection': ="app.bsky.feed.post", 'commit.operation': ="create", 'did': $10=$$, 'kind': ="commit", 'time_us': $13=(coerce_to_double($$)/1000000.0)] │ +╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +10 value(s) returned. 
+------------------------------------------------------------------------------------------------------------------------ +Physical query plan for query Q5: + +Opened database at '/db/catalog.json' +╭───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ Stage: str │ +├───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ Result[$28, $29] │ +│ Eval[$29=coerce_to_json($27)] │ +│ Eval[$28=coerce_to_json($26)] │ +│ LimitSkip[3, 0, $7->$27, $23->$26] │ +│ Eval[$7=(1000.0*timestamp_to_epoch_time(date_diff($9, $11)))] │ +│ Eval[$9=epoch_time_to_timestamp($24)] │ +│ Eval[$11=epoch_time_to_timestamp($25)] │ +│ Sort[Desc($19), $4->$23, $14->$24, $15->$25] │ +│ Eval[$19=($14-$15)] │ +│ HashGroup[$4=$20 | $14=Max($21), $15=Min($22)] │ +│ MergeSegments[] │ +│ HashGroup[$20=$12 | $21=Max($16), $22=Min($16)] │ +│ Scan['BLUESKY', 'commit.collection': ="app.bsky.feed.post", 'commit.operation': ="create", 'did': $12=$$, 'kind': ="commit", 'time_us': $16=(coerce_to_double($$)/1000000.0)] │ +╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +13 value(s) returned. 
diff --git a/nostromodb/results/_m6i.8xlarge_bluesky_1m.query_results b/nostromodb/results/_m6i.8xlarge_bluesky_1m.query_results new file mode 100644 index 0000000..ed94a09 --- /dev/null +++ b/nostromodb/results/_m6i.8xlarge_bluesky_1m.query_results @@ -0,0 +1,82 @@ +------------------------------------------------------------------------------------------------------------------------ +Result for query Q1: + +Opened database at '/db/catalog.json' +╭────────────────────────────┬────────────╮ +│ event: str │ count: int │ +├────────────────────────────┼────────────┤ +│ app.bsky.feed.like │ 448944 │ +│ app.bsky.graph.follow │ 360374 │ +│ app.bsky.feed.post │ 90816 │ +│ app.bsky.feed.repost │ 58540 │ +│ app.bsky.graph.block │ 14040 │ +│ app.bsky.actor.profile │ 11762 │ +│ app.bsky.graph.listitem │ 8103 │ +│ │ 5328 │ +│ app.bsky.graph.listblock │ 895 │ +│ app.bsky.graph.starterpack │ 405 │ +│ app.bsky.graph.list │ 356 │ +│ app.bsky.feed.threadgate │ 255 │ +│ app.bsky.feed.postgate │ 104 │ +│ app.bsky.feed.generator │ 74 │ +│ app.bsky.labeler.service │ 4 │ +╰────────────────────────────┴────────────╯ +15 value(s) returned. 
+------------------------------------------------------------------------------------------------------------------------ +Result for query Q2: + +Opened database at '/db/catalog.json' +╭────────────────────────────┬────────────┬─────────────╮ +│ event: str │ count: int │ users: uint │ +├────────────────────────────┼────────────┼─────────────┤ +│ app.bsky.feed.like │ 444523 │ 117617 │ +│ app.bsky.graph.follow │ 337978 │ 63957 │ +│ app.bsky.feed.post │ 86812 │ 50464 │ +│ app.bsky.feed.repost │ 56993 │ 26581 │ +│ app.bsky.graph.block │ 13838 │ 5785 │ +│ app.bsky.graph.listitem │ 7568 │ 1078 │ +│ app.bsky.actor.profile │ 5337 │ 5337 │ +│ app.bsky.graph.listblock │ 860 │ 449 │ +│ app.bsky.graph.list │ 259 │ 218 │ +│ app.bsky.feed.threadgate │ 228 │ 196 │ +│ app.bsky.graph.starterpack │ 104 │ 101 │ +│ app.bsky.feed.postgate │ 101 │ 82 │ +│ app.bsky.feed.generator │ 10 │ 9 │ +╰────────────────────────────┴────────────┴─────────────╯ +13 value(s) returned. +------------------------------------------------------------------------------------------------------------------------ +Result for query Q3: + +Opened database at '/db/catalog.json' +╭──────────────────────┬───────────────────┬────────────╮ +│ event: str │ hour_of_day: uint │ count: int │ +├──────────────────────┼───────────────────┼────────────┤ +│ app.bsky.feed.like │ 16 │ 444523 │ +│ app.bsky.feed.post │ 16 │ 86812 │ +│ app.bsky.feed.repost │ 16 │ 56993 │ +╰──────────────────────┴───────────────────┴────────────╯ +3 value(s) returned. 
+------------------------------------------------------------------------------------------------------------------------ +Result for query Q4: + +Opened database at '/db/catalog.json' +╭──────────────────────────────────┬───────────────────────────────╮ +│ user_id: str │ first_post_ts: str │ +├──────────────────────────────────┼───────────────────────────────┤ +│ did:plc:yj3sjq3blzpynh27cumnp5ks │ 2024-11-21 16:25:49.000166912 │ +│ did:plc:l5o3qjrmfztir54cpwlv2eme │ 2024-11-21 16:25:49.001904896 │ +│ did:plc:s4bwqchfzm6gjqfeb6mexgbu │ 2024-11-21 16:25:49.003907072 │ +╰──────────────────────────────────┴───────────────────────────────╯ +3 value(s) returned. +------------------------------------------------------------------------------------------------------------------------ +Result for query Q5: + +Opened database at '/db/catalog.json' +╭──────────────────────────────────┬────────────────────╮ +│ user_id: str │ activity_span: dbl │ +├──────────────────────────────────┼────────────────────┤ +│ did:plc:tsyymlun4eqjuw7hqrhmwagd │ 813006.958961 │ +│ did:plc:3ug235sfy2pz7cawmpsftb65 │ 811602.261066 │ +│ did:plc:doxhhgtxqiv47tmcovpbcqai │ 811404.021024 │ +╰──────────────────────────────────┴────────────────────╯ +3 value(s) returned. diff --git a/nostromodb/results/_m6i.8xlarge_bluesky_1m.results_runtime b/nostromodb/results/_m6i.8xlarge_bluesky_1m.results_runtime new file mode 100644 index 0000000..2ebe4cb --- /dev/null +++ b/nostromodb/results/_m6i.8xlarge_bluesky_1m.results_runtime @@ -0,0 +1,5 @@ +[0.02692273,0.01694564,0.01638707], +[0.09046148,0.09066723,0.06633239], +[0.07178712,0.05345615,0.07188973], +[0.08329340,0.05289203,0.05629961], +[0.08881190,0.06515721,0.06154216],