Skip to content

Querying Data

Catalyzed supports standard SQL for querying your data. This guide covers common query patterns and best practices.

Use the /queries endpoint to run SQL queries against your tables. The tables parameter maps the table names used in your SQL to their table IDs.

Basic SELECT query

Terminal window
curl -X POST https://api.catalyzed.ai/queries \
-H "Authorization: Bearer $API_TOKEN" \
-H "Content-Type: application/json" \
-d '{
"sql": "SELECT * FROM users LIMIT 10",
"tables": {"users": "KzaMsfA0LSw_Ld0KyaXIS"}
}'

Query across multiple tables by including them all in the tables mapping:

JOIN across tables

Terminal window
curl -X POST https://api.catalyzed.ai/queries \
-H "Authorization: Bearer $API_TOKEN" \
-H "Content-Type: application/json" \
-d '{
"sql": "SELECT o.order_id, c.name, o.total FROM orders o JOIN customers c ON o.customer_id = c.id",
"tables": {
"orders": "Ednc5U676CO4hn-FqsXeA",
"customers": "6fTBbbj4uv8TVMVh0gVch"
}
}'
-- Select specific columns
SELECT name, email, created_at FROM users
-- Select all columns
SELECT * FROM users
-- Column aliases
SELECT name AS customer_name, email AS contact FROM users
-- Expressions
SELECT name, price * quantity AS total FROM orders
-- Basic comparisons
SELECT * FROM users WHERE status = 'active'
SELECT * FROM orders WHERE amount > 100
SELECT * FROM orders WHERE created_at >= '2024-01-01'
-- Multiple conditions
SELECT * FROM orders WHERE status = 'active' AND amount > 100
SELECT * FROM orders WHERE status = 'active' OR status = 'pending'
-- NULL checks
SELECT * FROM users WHERE deleted_at IS NULL
SELECT * FROM orders WHERE notes IS NOT NULL
-- IN operator
SELECT * FROM orders WHERE status IN ('active', 'pending', 'review')
-- LIKE pattern matching
SELECT * FROM users WHERE email LIKE '%@example.com'
SELECT * FROM users WHERE name LIKE 'John%'
-- BETWEEN range
SELECT * FROM orders WHERE amount BETWEEN 100 AND 500
-- Ascending (default)
SELECT * FROM orders ORDER BY created_at
-- Descending
SELECT * FROM orders ORDER BY created_at DESC
-- Multiple columns
SELECT * FROM orders ORDER BY status ASC, created_at DESC
-- With NULLS handling
SELECT * FROM orders ORDER BY completed_at NULLS LAST
-- First 10 rows
SELECT * FROM users LIMIT 10
-- Skip first 20, get next 10
SELECT * FROM users LIMIT 10 OFFSET 20
-- Recommended: use ORDER BY with pagination
SELECT * FROM users ORDER BY id LIMIT 10 OFFSET 20
-- Count
SELECT COUNT(*) FROM orders
SELECT COUNT(DISTINCT customer_id) FROM orders
-- Sum, Average, Min, Max
SELECT SUM(amount) FROM orders
SELECT AVG(amount) FROM orders WHERE status = 'completed'
SELECT MIN(created_at), MAX(created_at) FROM orders
-- Group by
SELECT status, COUNT(*) as count, SUM(amount) as total
FROM orders
GROUP BY status
-- Having (filter groups)
SELECT customer_id, COUNT(*) as order_count
FROM orders
GROUP BY customer_id
HAVING COUNT(*) > 5
-- INNER JOIN (only matching rows)
SELECT o.*, c.name
FROM orders o
JOIN customers c ON o.customer_id = c.id
-- LEFT JOIN (all from left, matching from right)
SELECT c.name, o.order_id
FROM customers c
LEFT JOIN orders o ON c.id = o.customer_id
-- Multiple joins
SELECT o.order_id, c.name, p.product_name
FROM orders o
JOIN customers c ON o.customer_id = c.id
JOIN products p ON o.product_id = p.id
-- Subquery in WHERE
SELECT * FROM orders
WHERE customer_id IN (
SELECT id FROM customers WHERE status = 'premium'
)
-- Subquery in FROM
SELECT avg_amount, COUNT(*) as customer_count
FROM (
SELECT customer_id, AVG(amount) as avg_amount
FROM orders
GROUP BY customer_id
) customer_averages
GROUP BY avg_amount
WITH high_value_customers AS (
SELECT customer_id, SUM(amount) as total_spent
FROM orders
GROUP BY customer_id
HAVING SUM(amount) > 10000
)
SELECT c.name, hvc.total_spent
FROM high_value_customers hvc
JOIN customers c ON hvc.customer_id = c.id
ORDER BY hvc.total_spent DESC
-- Row number
SELECT name, amount,
ROW_NUMBER() OVER (ORDER BY amount DESC) as rank
FROM orders
-- Partition by
SELECT customer_id, order_date, amount,
SUM(amount) OVER (PARTITION BY customer_id) as customer_total
FROM orders
-- Running total
SELECT order_date, amount,
SUM(amount) OVER (ORDER BY order_date) as running_total
FROM orders

Queries return results in a structured JSON format:

{
"queryId": "qry_abc123",
"columns": [
{"name": "id", "type": "Utf8"},
{"name": "name", "type": "Utf8"},
{"name": "email", "type": "Utf8"},
{"name": "created_at", "type": "Timestamp(Microsecond, Some(\"UTC\"))"}
],
"rows": [
{"id": "1", "name": "Alice", "email": "[email protected]", "created_at": "2024-01-15T10:00:00Z"},
{"id": "2", "name": "Bob", "email": "[email protected]", "created_at": "2024-01-15T11:00:00Z"}
],
"rowCount": 2,
"truncated": false
}

Prevent accidentally returning too many rows:

{
"sql": "SELECT * FROM users",
"tables": {"users": "KzaMsfA0LSw_Ld0KyaXIS"},
"maxRows": 1000
}

Preview how a query will execute without running it:

Terminal window
curl -X POST https://api.catalyzed.ai/queries/explain \
-H "Authorization: Bearer $API_TOKEN" \
-H "Content-Type: application/json" \
-d '{
"sql": "SELECT * FROM users WHERE status = '\''active'\''",
"tables": {"users": "KzaMsfA0LSw_Ld0KyaXIS"}
}'
-- Good: Select only needed columns
SELECT id, name, status FROM users
-- Avoid: Select all columns
SELECT * FROM users
-- Good: Filter in WHERE
SELECT * FROM orders WHERE created_at > '2024-01-01'
-- Avoid: Filter in application code
SELECT * FROM orders -- then filter in code
-- Good: Always limit during development
SELECT * FROM orders WHERE status = 'active' LIMIT 100
-- Avoid: Unbounded queries
SELECT * FROM orders

For columns frequently used in WHERE clauses:

Terminal window
curl -X POST https://api.catalyzed.ai/dataset-tables/KzaMsfA0LSw_Ld0KyaXIS/indexes \
-H "Authorization: Bearer $API_TOKEN" \
-H "Content-Type: application/json" \
-d '{
"indexName": "idx_status",
"columnName": "status",
"indexType": "btree"
}'
  • Use timestamp for dates, not strings
  • Use numeric types (int64, float64) for numbers, not strings
  • This enables efficient comparisons and aggregations
{
"error": "QUERY_ERROR",
"message": "SQL syntax error at position 45: unexpected token 'FORM'"
}
{
"error": "VALIDATION_ERROR",
"message": "Table 'orders' referenced in SQL but not found in tables mapping"
}

Long-running queries may timeout. Use filters and limits to reduce data scanned.

Convert natural language queries to SQL using the /queries/translate endpoint. The translator understands table schemas, column types, and supports semantic search functions.

Translate natural language to SQL

Terminal window
curl -X POST https://api.catalyzed.ai/queries/translate \
-H "Authorization: Bearer $API_TOKEN" \
-H "Content-Type: application/json" \
-d '{
"tables": {
"orders": "Ednc5U676CO4hn-FqsXeA",
"customers": "6fTBbbj4uv8TVMVh0gVch"
},
"query": "Find customers who placed orders over $1000 last month",
"includeExplanation": true
}'

Response:

{
"sql": "SELECT DISTINCT c.name, c.email FROM customers c JOIN orders o ON c.id = o.customer_id WHERE o.amount > 1000 AND o.created_at >= date_trunc('month', CURRENT_DATE - INTERVAL '1 month')",
"confidence": 0.92,
"explanation": "This query joins customers and orders tables, filters for orders over $1000, and restricts to the previous month using date_trunc."
}
  • Schema Awareness - Understands table structures and column types
  • Semantic Search Support - Recognizes knn_search(), knn_cosine(), knn_l2(), and text_to_embedding() functions
  • SQL Validation - Optionally validates generated SQL syntax
  • Confidence Scores - Returns confidence level (0.0-1.0) for the translation
  • Explanations - Optional explanation of the generated SQL
OptionTypeDescription
tablesobjectMap of table aliases (as used in SQL) to table IDs
querystringNatural language query to translate
includeExplanationbooleanInclude explanation of generated SQL (default: false)
validateSyntaxbooleanValidate SQL syntax before returning (default: true)
{
"tables": {
"products": "tbl_abc123"
},
"query": "Find products similar to wireless headphones",
"includeExplanation": true
}

Generated SQL:

SELECT * FROM knn_cosine(
'products',
'embedding',
text_to_embedding('wireless headphones'),
10
)

See the SQL Translation API for complete endpoint documentation.

Catalyzed also supports semantic similarity search using vector embeddings. Use knn_search() to find similar documents, products, or any embedded content:

SELECT * FROM knn_cosine(
'products',
'embedding',
text_to_embedding('wireless headphones'),
10
)

See the Vector Search Guide for complete documentation on:

  • knn_search(), knn_cosine(), and knn_l2() functions
  • text_to_embedding() for query-time embeddings
  • Filtered vector search
  • Distance metrics and thresholds