Skip to content

Commit 68cfcf9

Browse files
sjaakola authored and janlindstrom committed
MDEV-29512 deadlock between commit monitor and THD::LOCK_thd_data mutex
This commit contains only an mtr test for reproducing the issue in MDEV-29512. The actual fix will be pushed to the wsrep-lib repository. The hang in MDEV-29512 happens when binlog purging is attempted while one local BF-aborted transaction is waiting for the commit monitor. The test launches a two-node cluster and enables binlogging with expire-logs-days set, to force binlog purging to happen. A local transaction is executed so that it becomes a BF abort victim after it has advanced to the replication stage, where it waits for the commit monitor for final cleanup (to mark its position in InnoDB). After that, the applier is released to complete the BF abort and, due to the binlog configuration, to start binlog purging. This is where the hang would occur if the code is buggy. Reviewed-by: Jan Lindström <jan.lindstrom@mariadb.com>
1 parent cd97523 commit 68cfcf9

File tree

3 files changed

+146
-0
lines changed

3 files changed

+146
-0
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
connection node_2;
2+
connection node_1;
3+
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY, f2 int, f3 varchar(2000));
4+
INSERT INTO t1 VALUES (1, 0, REPEAT('1234567890', 200));
5+
INSERT INTO t1 VALUES (3, 3, REPEAT('1234567890', 200));
6+
SET SESSION wsrep_sync_wait=0;
7+
SET GLOBAL DEBUG_DBUG = "d,sync.wsrep_apply_cb";
8+
connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1;
9+
connection node_1a;
10+
SET SESSION wsrep_sync_wait=0;
11+
connection node_1;
12+
begin;
13+
select f1,f2 from t1;
14+
f1 f2
15+
1 0
16+
3 3
17+
connection node_2;
18+
UPDATE t1 SET f2=2 WHERE f1=3;
19+
connection node_1a;
20+
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_apply_cb_reached";
21+
connection node_1;
22+
UPDATE t1 SET f2=1 WHERE f1=3;
23+
SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_master_enter_sync';
24+
COMMIT;
25+
connection node_1a;
26+
SET SESSION wsrep_on = 0;
27+
SET SESSION wsrep_on = 1;
28+
SET GLOBAL wsrep_provider_options = 'dbug=';
29+
SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_master_enter_sync';
30+
SET GLOBAL DEBUG_DBUG = "";
31+
SET DEBUG_SYNC = "now SIGNAL signal.wsrep_apply_cb";
32+
SET GLOBAL debug_dbug = NULL;
33+
SET debug_sync='RESET';
34+
connection node_1;
35+
ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
36+
select f1,f2 from t1;
37+
f1 f2
38+
1 0
39+
3 2
40+
DROP TABLE t1;
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
!include ../galera_2nodes.cnf
2+
3+
[mysqld]
4+
log-bin
5+
log-slave-updates
6+
7+
[mysqld.1]
8+
log_bin
9+
log_slave_updates
10+
max-binlog-size=4096
11+
expire-logs-days=1
12+
13+
14+
[mysqld.2]
15+
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#
2+
# This test is for reproducing the issue in:
3+
# https://jira.mariadb.org/browse/MDEV-29512
4+
#
5+
# The hang in MDEV-29512 happens when binlog purging is attempted while
6+
# one local BF-aborted transaction is waiting for the commit monitor.
7+
#
8+
# The test launches a two-node cluster and enables binlogging with expire-logs-days set,
9+
# to force binlog purging to happen.
10+
# A local transaction is executed so that it becomes a BF abort victim after it has advanced
11+
# to the replication stage, where it waits for the commit monitor for final cleanup (to mark
12+
# its position in InnoDB). After that, the applier is released to complete the BF abort and,
13+
# due to the binlog configuration, to start binlog purging. This is where the hang would occur if the code is buggy.
14+
#
15+
--source include/galera_cluster.inc
16+
--source include/have_innodb.inc
17+
--source include/have_debug_sync.inc
18+
--source include/galera_have_debug_sync.inc
19+
20+
#
21+
# binlog size is limited to 4096 bytes; we will create enough events to
22+
# cause binlog rotation
23+
#
24+
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY, f2 int, f3 varchar(2000));
25+
INSERT INTO t1 VALUES (1, 0, REPEAT('1234567890', 200));
26+
INSERT INTO t1 VALUES (3, 3, REPEAT('1234567890', 200));
27+
28+
SET SESSION wsrep_sync_wait=0;
29+
30+
# set sync point for replication applier
31+
SET GLOBAL DEBUG_DBUG = "d,sync.wsrep_apply_cb";
32+
33+
# Control connection to manage sync points for appliers
34+
--connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1
35+
--connection node_1a
36+
SET SESSION wsrep_sync_wait=0;
37+
38+
# starting local transaction, only select so far,
39+
# write will happen later and this will be ordered after the transaction in node_2
40+
--connection node_1
41+
begin;
42+
select f1,f2 from t1;
43+
44+
# send from node 2 an UPDATE transaction, which will BF abort the transaction in node_1
45+
--connection node_2
46+
--let $wait_condition=select count(*)=2 from t1
47+
--source include/wait_condition.inc
48+
49+
UPDATE t1 SET f2=2 WHERE f1=3;
50+
51+
--connection node_1a
52+
# wait to see the UPDATE from node_2 in apply_cb sync point
53+
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_apply_cb_reached";
54+
55+
--connection node_1
56+
# now issuing conflicting update
57+
UPDATE t1 SET f2=1 WHERE f1=3;
58+
59+
# Block the local commit, send final COMMIT and wait until it gets blocked
60+
--let $galera_sync_point = commit_monitor_master_enter_sync
61+
--source include/galera_set_sync_point.inc
62+
--send COMMIT
63+
64+
--connection node_1a
65+
# wait for the local commit to enter the commit monitor wait state
66+
--let $galera_sync_point = commit_monitor_master_enter_sync
67+
--source include/galera_wait_sync_point.inc
68+
--source include/galera_clear_sync_point.inc
69+
70+
# release the local transaction to continue with commit
71+
--let $galera_sync_point = commit_monitor_master_enter_sync
72+
--source include/galera_signal_sync_point.inc
73+
74+
# and now release the applier, it should force local trx to abort
75+
SET GLOBAL DEBUG_DBUG = "";
76+
SET DEBUG_SYNC = "now SIGNAL signal.wsrep_apply_cb";
77+
SET GLOBAL debug_dbug = NULL;
78+
SET debug_sync='RESET';
79+
80+
--connection node_1
81+
--error ER_LOCK_DEADLOCK
82+
--reap
83+
84+
# wait until applying is complete
85+
--let $wait_condition = SELECT COUNT(*)=1 FROM t1 WHERE f2=2
86+
--source include/wait_condition.inc
87+
88+
# final read to verify what we got
89+
select f1,f2 from t1;
90+
91+
DROP TABLE t1;

0 commit comments

Comments
 (0)