Skip to content

Commit 1524735

Browse files
lnguyen-yugabyteLinh Nguyen
authored andcommitted
[#14559][YSQL][YCQL] Implement select distinct pushdown to DocDB
Summary: We implement pushdown for SELECT DISTINCT query. We extend the Hybrid Scan work to support scanning tuple prefixes. Specifically, with a given prefix length, Hybrid Scan will advance to the next prefix that is different from the previous one. We need to determine the prefix length to be used when scanning in DocDB. This should equal the index of the last column to be requested in the scan. Test Plan: unit test added. A example to demonstrate pushdown working as intended: Populate data ``` yugabyte=# create table t(h int, c int); CREATE TABLE create index idx on t(h ASC, c ASC); yugabyte=# insert into t (select 1, i from generate_series(1, 1000000) as i); INSERT 0 1000000 yugabyte=# insert into t (select 2, i from generate_series(1, 1000000) as i); INSERT 0 1000000 ``` Before ``` yugabyte=# explain analyze select distinct h from t where h <= 2; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------- Unique (cost=0.00..15.50 rows=82 width=4) (actual time=7.259..12649.788 rows=2 loops=1) -> Index Only Scan using idx on t (cost=0.00..15.25 rows=100 width=4) (actual time=7.255..12415.214 rows=2000000 loops=1) Index Cond: (h <= 2) Heap Fetches: 0 Planning Time: 0.094 ms Execution Time: 12649.868 ms Peak Memory Usage: 8 kB (7 rows) ``` After ``` yugabyte=# explain analyze select distinct h from t where h <= 2; QUERY PLAN --------------------------------------------------------------------------------------------------------------------- Unique (cost=0.00..15.50 rows=82 width=4) (actual time=2.182..2.191 rows=2 loops=1) -> Index Only Scan using idx on t (cost=0.00..15.25 rows=100 width=4) (actual time=2.178..2.183 rows=2 loops=1) Index Cond: (h <= 2) Heap Fetches: 0 Planning Time: 0.113 ms Execution Time: 2.274 ms Peak Memory Usage: 8 kB (7 rows) ``` Reviewers: smishra, amartsinchyk, pjain, tnayak Reviewed By: tnayak Subscribers: rskannan, yql, kannan, smishra Differential Revision: 
https://phabricator.dev.yugabyte.com/D20742
1 parent c97640f commit 1524735

File tree

20 files changed

+375
-37
lines changed

20 files changed

+375
-37
lines changed

java/yb-cql/src/test/java/org/yb/cql/TestSelect.java

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2252,6 +2252,64 @@ public void testDistinct() throws Exception {
22522252
runInvalidQuery("select distinct h, s from test_distinct where c < 0;");
22532253
}
22542254

2255+
@Test
2256+
public void testDistinctPushdown() throws Exception {
2257+
session.execute("create table t(h int, c int, primary key(h, c))");
2258+
session.execute("insert into t(h, c) values (0, 0)");
2259+
session.execute("insert into t(h, c) values (0, 1)");
2260+
session.execute("insert into t(h, c) values (0, 2)");
2261+
session.execute("insert into t(h, c) values (1, 0)");
2262+
session.execute("insert into t(h, c) values (1, 1)");
2263+
session.execute("insert into t(h, c) values (1, 2)");
2264+
2265+
// For both queries, the scan should jump directly to the relevant primary key,
2266+
// so the number of seeks is equal to the items to be retrived.
2267+
{
2268+
String query = "select distinct h from t where h = 0";
2269+
String[] rows = {"Row[0]"};
2270+
2271+
RocksDBMetrics metrics = assertPartialRangeSpec("t", query, rows);
2272+
assertEquals(1, metrics.seekCount);
2273+
}
2274+
2275+
{
2276+
String query = "select distinct h from t where h in (0, 1)";
2277+
String[] rows = {"Row[0]", "Row[1]"};
2278+
2279+
RocksDBMetrics metrics = assertPartialRangeSpec("t", query, rows);
2280+
assertEquals(2, metrics.seekCount);
2281+
}
2282+
}
2283+
2284+
@Test
2285+
public void testDistinctPushdownSecondColumn() throws Exception {
2286+
session.execute("create table t(r1 int, r2 int, r3 int, primary key(r2, r3))");
2287+
session.execute("insert into t(r1, r2, r3) values (0, 0, 0)");
2288+
session.execute("insert into t(r1, r2, r3) values (0, 0, 1)");
2289+
session.execute("insert into t(r1, r2, r3) values (0, 0, 2)");
2290+
session.execute("insert into t(r1, r2, r3) values (1, 1, 0)");
2291+
session.execute("insert into t(r1, r2, r3) values (1, 1, 1)");
2292+
session.execute("insert into t(r1, r2, r3) values (1, 1, 2)");
2293+
2294+
// For both queries, the scan should jump directly to the relevant primary key,
2295+
// so the number of seeks is equal to the items to be retrived.
2296+
{
2297+
String query = "select distinct r2 from t where r2 = 0";
2298+
String[] rows = {"Row[0]"};
2299+
2300+
RocksDBMetrics metrics = assertPartialRangeSpec("t", query, rows);
2301+
assertEquals(1, metrics.seekCount);
2302+
}
2303+
2304+
{
2305+
String query = "select distinct r2 from t where r2 in (0, 1)";
2306+
String[] rows = {"Row[0]", "Row[1]"};
2307+
2308+
RocksDBMetrics metrics = assertPartialRangeSpec("t", query, rows);
2309+
assertEquals(2, metrics.seekCount);
2310+
}
2311+
}
2312+
22552313
@Test
22562314
public void testToJson() throws Exception {
22572315
// Create test table.

java/yb-pgsql/src/test/java/org/yb/pgsql/TestPgSelect.java

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -952,6 +952,204 @@ public void testPartialKeyScan() throws Exception {
952952
}
953953
}
954954

955+
@Test
956+
public void testIndexDistinctRangeScan() throws Exception {
957+
String query = "CREATE TABLE t(r1 INT, r2 INT, PRIMARY KEY(r1 ASC, r2 ASC))";
958+
try (Statement statement = connection.createStatement()) {
959+
statement.execute(query);
960+
961+
query = "INSERT INTO t (SELECT 1, i FROM GENERATE_SERIES(1, 10) AS i)";
962+
statement.execute(query);
963+
964+
query = "INSERT INTO t (SELECT 2, i FROM GENERATE_SERIES(1, 10) AS i)";
965+
statement.execute(query);
966+
967+
query = "INSERT INTO t (SELECT 3, i FROM GENERATE_SERIES(1, 10) AS i)";
968+
statement.execute(query);
969+
970+
Set<Row> expectedRows = new HashSet<>();
971+
expectedRows.add(new Row(1));
972+
expectedRows.add(new Row(2));
973+
expectedRows.add(new Row(3));
974+
query = "SELECT DISTINCT r1 FROM t WHERE r1 <= 3";
975+
assertRowSet(statement, query, expectedRows);
976+
977+
// With DISTINCT pushed down to DocDB, we only to scan three keys:
978+
// 1. From kLowest, seek to 1.
979+
// 2. From 1, seek to 2.
980+
// 3. From 2, seek to 3.
981+
// The constraint r1 <= 3 implies we are done after the third seek.
982+
RocksDBMetrics metrics = assertFullDocDBFilter(statement, query, "t");
983+
assertEquals(3, metrics.seekCount);
984+
}
985+
}
986+
987+
@Test
988+
public void testIndexDistinctMulticolumnsScan() throws Exception {
989+
String query = "CREATE TABLE t(r1 INT, r2 INT, r3 INT, PRIMARY KEY(r1 ASC, r2 ASC, r3 ASC))";
990+
try (Statement statement = connection.createStatement()) {
991+
statement.execute(query);
992+
993+
query = "INSERT INTO t (SELECT 1, 1, i FROM GENERATE_SERIES(1, 100) AS i)";
994+
statement.execute(query);
995+
996+
query = "INSERT INTO t (SELECT 2, 2, i FROM GENERATE_SERIES(1, 100) AS i)";
997+
statement.execute(query);
998+
999+
query = "INSERT INTO t (SELECT 3, 1, i FROM GENERATE_SERIES(1, 100) AS i)";
1000+
statement.execute(query);
1001+
1002+
query = "INSERT INTO t (SELECT 3, 2, i FROM GENERATE_SERIES(1, 100) AS i)";
1003+
statement.execute(query);
1004+
1005+
query = "INSERT INTO t (SELECT 3, 3, i FROM GENERATE_SERIES(1, 100) AS i)";
1006+
statement.execute(query);
1007+
1008+
Set<Row> expectedRows = new HashSet<>();
1009+
expectedRows.add(new Row(1, 1));
1010+
expectedRows.add(new Row(2, 2));
1011+
expectedRows.add(new Row(3, 1));
1012+
expectedRows.add(new Row(3, 2));
1013+
expectedRows.add(new Row(3, 3));
1014+
query = "SELECT DISTINCT r1, r2 FROM t WHERE r1 <= 3";
1015+
assertRowSet(statement, query, expectedRows);
1016+
1017+
// We need to do 6 seeks here:
1018+
// 1. Seek from (kLowest, kLowest), found (1, 1).
1019+
// 2. Seek from (1, 1), found (2, 2).
1020+
// 3. Seek from (2, 2), found (3, 1).
1021+
// 4. Seek from (3, 1), found (3, 2).
1022+
// 5. Seek from (3, 2), found (3, 3).
1023+
// 6. Seek from (3, 3), found no more key.
1024+
// Note that we need the last seek, since under the condition r1 <= 3, we don't know whether
1025+
// there are more items to be scanned.
1026+
RocksDBMetrics metrics = assertFullDocDBFilter(statement, query, "t");
1027+
assertEquals(6, metrics.seekCount);
1028+
}
1029+
}
1030+
1031+
@Test
1032+
public void testIndexDistinctSkipColumnScan() throws Exception {
1033+
String query = "CREATE TABLE t(r1 INT, r2 INT, r3 INT, r4 INT, " +
1034+
" PRIMARY KEY(r1 ASC, r2 ASC, r3 ASC, r4 ASC))";
1035+
1036+
try (Statement statement = connection.createStatement()) {
1037+
statement.execute(query);
1038+
1039+
query = "INSERT INTO t (SELECT 1, 1, 1, i FROM GENERATE_SERIES(1, 100) AS i)";
1040+
statement.execute(query);
1041+
1042+
query = "INSERT INTO t (SELECT 2, 2, 2, i FROM GENERATE_SERIES(1, 100) AS i)";
1043+
statement.execute(query);
1044+
1045+
query = "INSERT INTO t (SELECT 3, 3, 3, i FROM GENERATE_SERIES(1, 100) AS i)";
1046+
statement.execute(query);
1047+
1048+
Set<Row> expectedRows = new HashSet<>();
1049+
expectedRows.add(new Row(1, 1));
1050+
expectedRows.add(new Row(2, 2));
1051+
expectedRows.add(new Row(3, 3));
1052+
query = "SELECT DISTINCT r1, r3 FROM t WHERE r3 <= 3";
1053+
assertRowSet(statement, query, expectedRows);
1054+
1055+
RocksDBMetrics metrics = assertFullDocDBFilter(statement, query, "t");
1056+
assertEquals(1, metrics.seekCount);
1057+
}
1058+
}
1059+
1060+
@Test
1061+
public void testDistinctScanHashColumn() throws Exception {
1062+
String query = "CREATE TABLE t(r1 INT, r2 INT, r3 INT, PRIMARY KEY(r1 HASH, r2 ASC, r3 ASC))";
1063+
try (Statement statement = connection.createStatement()) {
1064+
statement.execute(query);
1065+
1066+
query = "INSERT INTO t (SELECT i, i, i FROM GENERATE_SERIES(1, 100) AS i)";
1067+
statement.execute(query);
1068+
1069+
{
1070+
Set<Row> expectedRows = new HashSet<>();
1071+
for (int i = 1; i <= 100; i++) {
1072+
expectedRows.add(new Row(i));
1073+
}
1074+
1075+
query = "SELECT DISTINCT r1 FROM t WHERE r1 <= 100";
1076+
assertRowSet(statement, query, expectedRows);
1077+
1078+
// Here we do a sequential scan.
1079+
RocksDBMetrics metrics = assertFullDocDBFilter(statement, query, "t");
1080+
assertEquals(3, metrics.seekCount);
1081+
}
1082+
1083+
{
1084+
Set<Row> expectedRows = new HashSet<>();
1085+
expectedRows.add(new Row(100));
1086+
1087+
query = "SELECT DISTINCT r1 FROM t WHERE r1 = 100";
1088+
assertRowSet(statement, query, expectedRows);
1089+
1090+
// Here we do an index scan.
1091+
RocksDBMetrics metrics = assertFullDocDBFilter(statement, query, "t");
1092+
assertEquals(1, metrics.seekCount);
1093+
}
1094+
}
1095+
}
1096+
1097+
@Test
1098+
public void testDistinctMultiHashColumns() throws Exception {
1099+
String query = "CREATE TABLE t(h1 INT, h2 INT, r INT, PRIMARY KEY((h1, h2) HASH, r ASC))";
1100+
try (Statement statement = connection.createStatement()) {
1101+
statement.execute(query);
1102+
1103+
query = "INSERT INTO t (SELECT i, i, i FROM GENERATE_SERIES(1, 100) AS i)";
1104+
statement.execute(query);
1105+
1106+
{
1107+
Set<Row> expectedRows = new HashSet<>();
1108+
expectedRows.add(new Row(1, 1));
1109+
1110+
query = "SELECT DISTINCT h1, h2 FROM t WHERE h1 = 1 AND h2 = 1";
1111+
assertRowSet(statement, query, expectedRows);
1112+
1113+
RocksDBMetrics metrics = assertFullDocDBFilter(statement, query, "t");
1114+
assertEquals(1, metrics.seekCount);
1115+
}
1116+
}
1117+
}
1118+
1119+
@Test
1120+
public void testDistinctOnNonPrefixScan() throws Exception {
1121+
String query = "CREATE TABLE t(r1 INT, r2 INT, r3 INT)";
1122+
try (Statement statement = connection.createStatement()) {
1123+
statement.execute(query);
1124+
1125+
query = "CREATE INDEX idx on t(r3 ASC)";
1126+
statement.execute(query);
1127+
1128+
query = "INSERT INTO t (SELECT i, 1, 1 FROM GENERATE_SERIES(1, 100) AS i)";
1129+
statement.execute(query);
1130+
1131+
{
1132+
Set<Row> expectedRows = new HashSet<>();
1133+
expectedRows.add(new Row(1));
1134+
1135+
query = "SELECT DISTINCT r3 FROM t WHERE r3 <= 10";
1136+
assertRowSet(statement, query, expectedRows);
1137+
1138+
// Here we perform the seek on the index table with two seeks:
1139+
// 1. From kLowest, we seek to 1.
1140+
// 2. From 1, we seek to the next key, which is not found.
1141+
// Note that we need to seek on the index table. The main table will result
1142+
// in zero seeks.
1143+
1144+
RocksDBMetrics metrics = assertFullDocDBFilter(statement, query, "t");
1145+
assertEquals(0, metrics.seekCount);
1146+
1147+
metrics = assertFullDocDBFilter(statement, query, "idx");
1148+
assertEquals(2, metrics.seekCount);
1149+
}
1150+
}
1151+
}
1152+
9551153
@Test
9561154
public void testStrictInequalities() throws Exception {
9571155
String query = "CREATE TABLE sample_table(h INT, r1 INT, r2 INT, r3 INT, " +

src/postgres/src/backend/executor/nodeUnique.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,13 @@ ExecUnique(PlanState *pstate)
5353
PlanState *outerPlan;
5454

5555
CHECK_FOR_INTERRUPTS();
56+
57+
/*
58+
* SELECT DISTINCT is only enabled for an index scan. Specifically, for a scan on hash columns,
59+
* the index scan will not be used.
60+
*/
61+
if (IsYugaByteEnabled())
62+
pstate->state->yb_exec_params.is_select_distinct = true;
5663

5764
/*
5865
* get information from the node

src/yb/common/pgsql_protocol.proto

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,10 @@ message PgsqlReadRequestPB {
407407
// Reading distinct columns?
408408
optional bool distinct = 11 [default = false];
409409

410+
// Currently only used on a SELECT DISTINCT scan. If the value is greater than 0, use the specified
411+
// prefix length to scan the table.
412+
optional uint64 prefix_length = 39 [default = 0];
413+
410414
// Flag for reading aggregate values.
411415
optional bool is_aggregate = 12 [default = false];
412416

src/yb/common/ql_protocol.proto

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,10 @@ message QLReadRequestPB {
252252
// Reading distinct columns?
253253
optional bool distinct = 12 [default = false];
254254

255+
// Currently only used on a SELECT DISTINCT scan. If the value is greater than 0, use the specified
256+
// prefix length to scan the table.
257+
optional uint64 prefix_length = 23 [default = 0];
258+
255259
// Limit number of rows to return. For QL SELECT, this limit is the smaller of the page size (max
256260
// (max number of rows to return per fetch) & the LIMIT clause if present in the SELECT statement.
257261
optional uint64 limit = 8;

src/yb/docdb/doc_pgsql_scanspec.cc

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ DocPgsqlScanSpec::DocPgsqlScanSpec(const Schema& schema,
3636
const boost::optional<int32_t> hash_code,
3737
const boost::optional<int32_t> max_hash_code,
3838
const DocKey& start_doc_key,
39-
bool is_forward_scan)
39+
bool is_forward_scan,
40+
const size_t prefix_length)
4041
: PgsqlScanSpec(nullptr),
4142
schema_(schema),
4243
query_id_(query_id),
@@ -47,7 +48,8 @@ DocPgsqlScanSpec::DocPgsqlScanSpec(const Schema& schema,
4748
max_hash_code_(max_hash_code),
4849
start_doc_key_(start_doc_key.empty() ? KeyBytes() : start_doc_key.Encode()),
4950
lower_doc_key_(doc_key.Encode()),
50-
is_forward_scan_(is_forward_scan) {
51+
is_forward_scan_(is_forward_scan),
52+
prefix_length_(prefix_length) {
5153

5254
// Compute lower and upper doc_key.
5355
// We add +inf as an extra component to make sure this is greater than all keys in range.
@@ -87,7 +89,8 @@ DocPgsqlScanSpec::DocPgsqlScanSpec(
8789
const DocKey& start_doc_key,
8890
bool is_forward_scan,
8991
const DocKey& lower_doc_key,
90-
const DocKey& upper_doc_key)
92+
const DocKey& upper_doc_key,
93+
const size_t prefix_length)
9194
: PgsqlScanSpec(where_expr),
9295
range_bounds_(condition ? new QLScanRange(schema, *condition) : nullptr),
9396
schema_(schema),
@@ -100,7 +103,8 @@ DocPgsqlScanSpec::DocPgsqlScanSpec(
100103
start_doc_key_(start_doc_key.empty() ? KeyBytes() : start_doc_key.Encode()),
101104
lower_doc_key_(lower_doc_key.Encode()),
102105
upper_doc_key_(upper_doc_key.Encode()),
103-
is_forward_scan_(is_forward_scan) {
106+
is_forward_scan_(is_forward_scan),
107+
prefix_length_(prefix_length) {
104108

105109
auto lower_bound_key = bound_key(schema, true);
106110
lower_doc_key_ = lower_bound_key > lower_doc_key_

src/yb/docdb/doc_pgsql_scanspec.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ class DocPgsqlScanSpec : public PgsqlScanSpec {
3737
const boost::optional<int32_t> hash_code = boost::none,
3838
const boost::optional<int32_t> max_hash_code = boost::none,
3939
const DocKey& start_doc_key = DefaultStartDocKey(),
40-
bool is_forward_scan = true);
40+
bool is_forward_scan = true,
41+
const size_t prefix_length = 0);
4142

4243
// Scan for the given hash key, a condition, and optional doc_key.
4344
//
@@ -56,7 +57,8 @@ class DocPgsqlScanSpec : public PgsqlScanSpec {
5657
const DocKey& start_doc_key = DefaultStartDocKey(),
5758
bool is_forward_scan = true,
5859
const DocKey& lower_doc_key = DefaultStartDocKey(),
59-
const DocKey& upper_doc_key = DefaultStartDocKey());
60+
const DocKey& upper_doc_key = DefaultStartDocKey(),
61+
const size_t prefix_length = 0);
6062

6163
//------------------------------------------------------------------------------------------------
6264
// Access funtions.
@@ -68,6 +70,10 @@ class DocPgsqlScanSpec : public PgsqlScanSpec {
6870
return is_forward_scan_;
6971
}
7072

73+
const size_t prefix_length() const {
74+
return prefix_length_;
75+
}
76+
7177
//------------------------------------------------------------------------------------------------
7278
// Filters.
7379
std::shared_ptr<rocksdb::ReadFileFilter> CreateFileFilter() const;
@@ -161,6 +167,8 @@ class DocPgsqlScanSpec : public PgsqlScanSpec {
161167
// Scan behavior.
162168
bool is_forward_scan_;
163169

170+
size_t prefix_length_ = 0;
171+
164172
DISALLOW_COPY_AND_ASSIGN(DocPgsqlScanSpec);
165173
};
166174

0 commit comments

Comments
 (0)