Skip to content

Commit d13eb66

Browse files
committed
MDEV-13831: Assertion on event group missing XID/COMMIT event
The assertion occurred in the SQL thread if an event group was incompletely written (missing its terminating XID or COMMIT event) and was immediately followed by a new event group. This could also lead to the incomplete event group being committed, and with the wrong GTID. Fix by rolling back any active transaction left over from a prior event group when applying the following GTID event. An incomplete event group like this should be rare. If the server crashes in the middle of writing an event group, the server restart will write a new format description event, which makes the slave roll back the partial event group. But presumably it could happen if the master experiences temporary write errors in the binlog, for example an intermittent disk-full condition. Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
1 parent ec002a1 commit d13eb66

File tree

3 files changed

+84
-1
lines changed

3 files changed

+84
-1
lines changed
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
include/master-slave.inc
2+
[connection master]
3+
*** MDEV-13831: Assertion on event group missing XID/COMMIT event
4+
connection slave;
5+
include/stop_slave.inc
6+
CHANGE MASTER TO Master_use_gtid= No;
7+
include/start_slave.inc
8+
connection master;
9+
SET @old_legacy= @@GLOBAL.binlog_legacy_event_pos;
10+
SET GLOBAL binlog_legacy_event_pos= 1;
11+
CREATE TABLE t1 (a INT) ENGINE=InnoDB;
12+
INSERT INTO t1 VALUES (1);
13+
connection master1;
14+
SET @old_dbug= @@SESSION.debug_dbug;
15+
SET SESSION debug_dbug = '+d,fail_binlog_write_1';
16+
UPDATE t1 SET a = 2;
17+
ERROR HY000: Error writing file 'master-bin' (errno: 28 "No space left on device")
18+
SET debug_dbug= @old_dbug;
19+
DROP TEMPORARY TABLE t1;
20+
ERROR 42S02: Unknown table 'test.t1'
21+
connection master;
22+
CREATE TEMPORARY TABLE t1 (i INT) ENGINE=InnoDB;
23+
connection slave;
24+
connection master;
25+
SET GLOBAL binlog_legacy_event_pos= @old_legacy;
26+
CALL mtr.add_suppression("Error writing file.*No space left on device");
27+
DROP TEMPORARY TABLE t1;
28+
DROP TABLE t1;
29+
include/rpl_end.inc
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
--source include/have_debug.inc
2+
--source include/have_innodb.inc
3+
--source include/have_binlog_format_mixed.inc
4+
--source include/master-slave.inc
5+
6+
--echo *** MDEV-13831: Assertion on event group missing XID/COMMIT event
7+
8+
--connection slave
9+
--source include/stop_slave.inc
10+
# Use non-GTID mode. In GTID mode, the IO thread will fail if it sees an
11+
# incomplete event group after MDEV-27697 patch.
12+
CHANGE MASTER TO Master_use_gtid= No;
13+
--source include/start_slave.inc
14+
15+
--connection master
16+
# The dbug injection below is only active in legacy mode.
17+
SET @old_legacy= @@GLOBAL.binlog_legacy_event_pos;
18+
SET GLOBAL binlog_legacy_event_pos= 1;
19+
CREATE TABLE t1 (a INT) ENGINE=InnoDB;
20+
INSERT INTO t1 VALUES (1);
21+
22+
--connection master1
23+
SET @old_dbug= @@SESSION.debug_dbug;
24+
SET SESSION debug_dbug = '+d,fail_binlog_write_1';
25+
--error ER_ERROR_ON_WRITE
26+
UPDATE t1 SET a = 2;
27+
SET debug_dbug= @old_dbug;
28+
--error ER_BAD_TABLE_ERROR
29+
DROP TEMPORARY TABLE t1;
30+
31+
--connection master
32+
CREATE TEMPORARY TABLE t1 (i INT) ENGINE=InnoDB;
33+
34+
--sync_slave_with_master
35+
36+
--connection master
37+
SET GLOBAL binlog_legacy_event_pos= @old_legacy;
38+
CALL mtr.add_suppression("Error writing file.*No space left on device");
39+
DROP TEMPORARY TABLE t1;
40+
DROP TABLE t1;
41+
--source include/rpl_end.inc

sql/log_event_server.cc

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3091,7 +3091,20 @@ static char gtid_begin_string[] = "BEGIN";
30913091
int
30923092
Gtid_log_event::do_apply_event(rpl_group_info *rgi)
30933093
{
3094+
Relay_log_info *rli= rgi->rli;
30943095
ulonglong bits= thd->variables.option_bits;
3096+
3097+
if (unlikely(thd->transaction->all.ha_list || (bits & OPTION_GTID_BEGIN)))
3098+
{
3099+
rli->report(WARNING_LEVEL, 0, NULL,
3100+
"Rolling back unfinished transaction (no COMMIT "
3101+
"or ROLLBACK in relay log). This indicates a corrupt binlog "
3102+
"on the master, possibly caused by disk full or other write "
3103+
"error.");
3104+
rgi->cleanup_context(thd, 1);
3105+
bits= thd->variables.option_bits;
3106+
}
3107+
30953108
thd->variables.server_id= this->server_id;
30963109
thd->variables.gtid_domain_id= this->domain_id;
30973110
thd->variables.gtid_seq_no= this->seq_no;
@@ -3110,7 +3123,7 @@ Gtid_log_event::do_apply_event(rpl_group_info *rgi)
31103123

31113124
DBUG_ASSERT((bits & OPTION_GTID_BEGIN) == 0);
31123125

3113-
Master_info *mi=rgi->rli->mi;
3126+
Master_info *mi= rli->mi;
31143127
switch (flags2 & (FL_DDL | FL_TRANSACTIONAL))
31153128
{
31163129
case FL_TRANSACTIONAL:

0 commit comments

Comments
 (0)