Payal Sinha

All about what I do, what I think... Engineering, Lessons in Corporate world, etc.

Cassandra quick ref

Connection

cqlsh                          # local connect
cqlsh <host> <port>           # remote
cqlsh -u <user> -p <pass>     # with auth

Keyspace Ops

-- create
CREATE KEYSPACE ks_name 
WITH replication = {'class':'SimpleStrategy', 'replication_factor':3};

-- use
USE ks_name;

-- list all
DESCRIBE keyspaces;

-- drop
DROP KEYSPACE ks_name;

Table Ops

-- create
CREATE TABLE users (
  user_id UUID PRIMARY KEY,
  name TEXT,
  email TEXT,
  created_at TIMESTAMP
);

-- with clustering
CREATE TABLE events (
  user_id UUID,
  event_time TIMESTAMP,
  event_type TEXT,
  PRIMARY KEY (user_id, event_time)
) WITH CLUSTERING ORDER BY (event_time DESC);

-- describe
DESCRIBE TABLE table_name;

-- alter
ALTER TABLE users ADD phone TEXT;

-- drop
DROP TABLE users;

CRUD

-- insert
INSERT INTO users (user_id, name, email) 
VALUES (uuid(), 'John', 'john@example.com');

-- with TTL
INSERT INTO users (...) VALUES (...) USING TTL 86400;

-- select
SELECT * FROM users;
SELECT * FROM users WHERE user_id = <uuid>;
SELECT * FROM users LIMIT 10;

-- update
UPDATE users SET name='Jane' WHERE user_id=<uuid>;

-- delete
DELETE FROM users WHERE user_id=<uuid>;
DELETE name FROM users WHERE user_id=<uuid>;  -- delete a single column

Indexes

-- create secondary index
CREATE INDEX ON users (email);

-- drop
DROP INDEX users_email_idx;

Batch Operations

BEGIN BATCH
  INSERT INTO users (...) VALUES (...);
  UPDATE users SET ... WHERE ...;
  DELETE FROM users WHERE ...;
APPLY BATCH;

Useful Queries

-- count (expensive!)
SELECT COUNT(*) FROM users;

-- token range
SELECT * FROM users WHERE token(user_id) > token(<uuid>);

-- allow filtering (use carefully)
SELECT * FROM users WHERE email='john@example.com' ALLOW FILTERING;

Admin/Utility

DESCRIBE CLUSTER;
DESCRIBE SCHEMA;
CONSISTENCY QUORUM;        -- set consistency level
TRACING ON;                -- enable query tracing
SOURCE '/path/to/file.cql'; -- execute file

Data Types (common)

  • TEXT, VARCHAR
  • INT, BIGINT, SMALLINT
  • FLOAT, DOUBLE, DECIMAL
  • BOOLEAN
  • UUID, TIMEUUID
  • TIMESTAMP, DATE, TIME
  • BLOB
  • SET<type>, LIST<type>, MAP<type,type>

Nodetool Commands

# cluster status
nodetool status                    # cluster ring status
nodetool info                      # node info
nodetool describecluster           # cluster details
nodetool ring                      # token ring

# maintenance
nodetool repair                    # repair all keyspaces
nodetool repair -full              # full repair
nodetool repair ks_name table_name # specific table
nodetool compact                   # force compaction
nodetool cleanup                   # cleanup after topology change
nodetool flush                     # flush memtables to disk

# performance
nodetool tpstats                   # thread pool stats
nodetool tablestats                # table statistics
nodetool cfstats                   # columnfamily stats (old)
nodetool tablehistograms ks table  # latency histograms
nodetool proxyhistograms           # coordinator stats

# monitoring
nodetool netstats                  # network info
nodetool gcstats                   # GC stats
nodetool statusbinary              # native protocol status
nodetool statusthrift              # thrift status (pre-4.0 only; Thrift was removed in Cassandra 4.0)
nodetool compactionstats           # compaction progress

# node ops
nodetool drain                     # stop writes, flush
nodetool stopdaemon               # stop cassandra
nodetool assassinate <ip>          # force remove dead node
nodetool removenode <host_id>      # remove node properly
nodetool decommission             # leave cluster gracefully

# snapshots
nodetool snapshot -t snap_name     # create snapshot
nodetool listsnapshots             # list all snapshots
nodetool clearsnapshot -t name     # clear snapshot

# cache
nodetool invalidatekeycache
nodetool invalidaterowcache

# misc
nodetool settraceprobability 0.1   # set trace sampling
nodetool getlogginglevels          # check log levels
nodetool setlogginglevel class LEVEL

SSTable Tools

sstableloader -d <host> <sstable_dir>  # bulk load
sstablelevelreset <ks> <table>         # reset levels
sstablemetadata <sstable_file>         # view metadata
sstableutil <ks> <table>               # list sstables
sstabledump <sstable_file>             # dump as JSON
sstablescrub <ks> <table>              # fix corrupted sstables

Config Files

# important files
/etc/cassandra/cassandra.yaml      # main config
/etc/cassandra/cassandra-env.sh    # JVM settings
/var/log/cassandra/system.log      # main log
/var/lib/cassandra/data/           # data dir
/var/lib/cassandra/commitlog/      # commit logs

Key Yaml Settings

cluster_name
seeds: "ip1,ip2,ip3"
listen_address
rpc_address
data_file_directories
commitlog_directory
concurrent_reads: 32
concurrent_writes: 32
memtable_flush_writers: 4
compaction_throughput_mb_per_sec: 64

JVM Tuning (cassandra-env.sh)

# heap size
MAX_HEAP_SIZE="8G"
HEAP_NEWSIZE="800M"

# GC (example G1GC)
JVM_OPTS="$JVM_OPTS -XX:+UseG1GC"

Backup/Restore

# backup
nodetool snapshot -t backup_name

# find snapshots
find /var/lib/cassandra/data -name snapshots

# restore (stop node first)
# copy snapshot files to table dir
# then restart and run:
nodetool refresh ks_name table_name

Performance Checks

# check disk IO
iostat -x 5

# check network
iftop
nodetool netstats

# check compaction backlog
nodetool compactionstats

# check pending tasks
nodetool tpstats | grep -i pending

Remember

  • Primary key = partition key + clustering columns
  • Queries must specify the full partition key; clustering columns can then be restricted in declared order (a prefix, up to the full primary key)
  • ORDER BY only on clustering columns
  • No JOINs - denormalize!
  • ALLOW FILTERING is slow - avoid in prod
  • Use BATCH for same partition only (performance)
  • TTL in seconds
  • Always run nodetool repair regularly - at least once per gc_grace_seconds (default 10 days) so deleted data cannot reappear
  • Use cleanup on the existing nodes after adding nodes (drops data for token ranges they no longer own; not needed after removing nodes)
  • Monitor pending compactions
  • GC pauses > 1s are bad