Skip to content

Commit 789d653

Browse files
committed
Set next multixid's offset when creating a new multixid
With this commit, the next multixid's offset will always be set on the offsets page, by the time that a backend might try to read it, so we no longer need the waiting mechanism with the condition variable. In other words, this eliminates "corner case 2" mentioned in the comments. The waiting mechanism was broken in a few scenarios: - When nextMulti was advanced without WAL-logging the next multixid. For example, if a later multixid was already assigned and WAL-logged before the previous one was WAL-logged, and then the server crashed. In that case the next offset would never be set in the offsets SLRU, and a query trying to read it would get stuck waiting for it. Same thing could happen if pg_resetwal was used to forcibly advance nextMulti. - In hot standby mode, a deadlock could happen where one backend waits for the next multixid assignment record, but WAL replay is not advancing because of a recovery conflict with the waiting backend. The old TAP test used carefully placed injection points to exercise the old waiting code, but now that the waiting code is gone, much of the old test is no longer relevant. Rewrite the test to reproduce the IPC/MultixactCreation hang after crash recovery instead, and to verify that previously recorded multixids stay readable. Backpatch to all supported versions. In back-branches, we still need to be able to read WAL that was generated before this fix, so in the back-branches this includes a hack to initialize the next offsets page when replaying XLOG_MULTIXACT_CREATE_ID for the last multixid on a page. On 'master', bump XLOG_PAGE_MAGIC instead to indicate that the WAL is not compatible. Author: Andrey Borodin <amborodin@acm.org> Reviewed-by: Dmitry Yurichev <dsy.075@yandex.ru> Reviewed-by: Álvaro Herrera <alvherre@kurilemu.de> Reviewed-by: Kirill Reshke <reshkekirill@gmail.com> Reviewed-by: Ivan Bykov <i.bykov@modernsys.ru> Reviewed-by: Chao Li <li.evan.chao@gmail.com> Discussion: https://www.postgresql.org/message-id/172e5723-d65f-4eec-b512-14beacb326ce@yandex.ru Backpatch-through: 14
1 parent 9b05e2e commit 789d653

File tree

4 files changed

+127
-180
lines changed

4 files changed

+127
-180
lines changed

src/backend/access/transam/multixact.c

Lines changed: 94 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,6 @@
7979
#include "pg_trace.h"
8080
#include "pgstat.h"
8181
#include "postmaster/autovacuum.h"
82-
#include "storage/condition_variable.h"
8382
#include "storage/pmsignal.h"
8483
#include "storage/proc.h"
8584
#include "storage/procarray.h"
@@ -271,12 +270,6 @@ typedef struct MultiXactStateData
271270
/* support for members anti-wraparound measures */
272271
MultiXactOffset offsetStopLimit;/* known if oldestOffsetKnown */
273272

274-
/*
275-
* This is used to sleep until a multixact offset is written when we want
276-
* to create the next one.
277-
*/
278-
ConditionVariable nextoff_cv;
279-
280273
/*
281274
* Per-backend data starts here. We have two arrays stored in the area
282275
* immediately following the MultiXactStateData struct. Each is indexed by
@@ -912,13 +905,33 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
912905
intentryno;
913906
intslotno;
914907
MultiXactOffset *offptr;
915-
inti;
908+
MultiXactId next;
909+
int64next_pageno;
910+
intnext_entryno;
911+
MultiXactOffset *next_offptr;
916912
LWLock *lock;
917913
LWLock *prevlock = NULL;
918914

915+
/* position of this multixid in the offsets SLRU area */
919916
pageno = MultiXactIdToOffsetPage(multi);
920917
entryno = MultiXactIdToOffsetEntry(multi);
921918

919+
/* position of the next multixid */
920+
next = multi + 1;
921+
if (next < FirstMultiXactId)
922+
next = FirstMultiXactId;
923+
next_pageno = MultiXactIdToOffsetPage(next);
924+
next_entryno = MultiXactIdToOffsetEntry(next);
925+
926+
/*
927+
* Set the starting offset of this multixid's members.
928+
*
929+
* In the common case, it was already be set by the previous
930+
* RecordNewMultiXact call, as this was the next multixid of the previous
931+
* multixid. But if multiple backends are generating multixids
932+
* concurrently, we might race ahead and get called before the previous
933+
* multixid.
934+
*/
922935
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
923936
LWLockAcquire(lock, LW_EXCLUSIVE);
924937

@@ -933,22 +946,50 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
933946
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
934947
offptr += entryno;
935948

936-
*offptr = offset;
949+
if (*offptr != offset)
950+
{
951+
/* should already be set to the correct value, or not at all */
952+
Assert(*offptr == 0);
953+
*offptr = offset;
954+
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
955+
}
956+
957+
/*
958+
* Set the next multixid's offset to the end of this multixid's members.
959+
*/
960+
if (next_pageno == pageno)
961+
{
962+
next_offptr = offptr + 1;
963+
}
964+
else
965+
{
966+
/* must be the first entry on the page */
967+
Assert(next_entryno == 0 || next == FirstMultiXactId);
968+
969+
/* Swap the lock for a lock on the next page */
970+
LWLockRelease(lock);
971+
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
972+
LWLockAcquire(lock, LW_EXCLUSIVE);
973+
974+
slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
975+
next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
976+
next_offptr += next_entryno;
977+
}
937978

938-
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
979+
if (*next_offptr != offset + nmembers)
980+
{
981+
/* should already be set to the correct value, or not at all */
982+
Assert(*next_offptr == 0);
983+
*next_offptr = offset + nmembers;
984+
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
985+
}
939986

940987
/* Release MultiXactOffset SLRU lock. */
941988
LWLockRelease(lock);
942989

943-
/*
944-
* If anybody was waiting to know the offset of this multixact ID we just
945-
* wrote, they can read it now, so wake them up.
946-
*/
947-
ConditionVariableBroadcast(&MultiXactState->nextoff_cv);
948-
949990
prev_pageno = -1;
950991

951-
for (i = 0; i < nmembers; i++, offset++)
992+
for (int i = 0; i < nmembers; i++, offset++)
952993
{
953994
TransactionId *memberptr;
954995
uint32 *flagsptr;
@@ -1138,8 +1179,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
11381179
result = FirstMultiXactId;
11391180
}
11401181

1141-
/* Make sure there is room for the MXID in the file. */
1142-
ExtendMultiXactOffset(result);
1182+
/*
1183+
* Make sure there is room for the next MXID in the file. Assigning this
1184+
* MXID sets the next MXID's offset already.
1185+
*/
1186+
ExtendMultiXactOffset(result + 1);
11431187

11441188
/*
11451189
* Reserve the members space, similarly to above. Also, be careful not to
@@ -1300,11 +1344,8 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13001344
inttruelength;
13011345
MultiXactId oldestMXact;
13021346
MultiXactId nextMXact;
1303-
MultiXactId tmpMXact;
1304-
MultiXactOffset nextOffset;
13051347
MultiXactMember *ptr;
13061348
LWLock *lock;
1307-
boolslept = false;
13081349

13091350
debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
13101351

@@ -1351,14 +1392,12 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13511392
* error.
13521393
*
13531394
* Shared lock is enough here since we aren't modifying any global state.
1354-
* Acquire it just long enough to grab the current counter values. We may
1355-
* need both nextMXact and nextOffset; see below.
1395+
* Acquire it just long enough to grab the current counter values.
13561396
*/
13571397
LWLockAcquire(MultiXactGenLock, LW_SHARED);
13581398

13591399
oldestMXact = MultiXactState->oldestMultiXactId;
13601400
nextMXact = MultiXactState->nextMXact;
1361-
nextOffset = MultiXactState->nextOffset;
13621401

13631402
LWLockRelease(MultiXactGenLock);
13641403

@@ -1378,68 +1417,39 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13781417
* Find out the offset at which we need to start reading MultiXactMembers
13791418
* and the number of members in the multixact. We determine the latter as
13801419
* the difference between this multixact's starting offset and the next
1381-
* one's. However, there are some corner cases to worry about:
1420+
* one's. However, there is one corner case to worry about:
13821421
*
1383-
* 1. This multixact may be the latest one created, in which case there is
1384-
* no next one to look at. In this case the nextOffset value we just
1385-
* saved is the correct endpoint.
1386-
*
1387-
* 2. The next multixact may still be in process of being filled in: that
1388-
* is, another process may have done GetNewMultiXactId but not yet written
1389-
* the offset entry for that ID. In that scenario, it is guaranteed that
1390-
* the offset entry for that multixact exists (because GetNewMultiXactId
1391-
* won't release MultiXactGenLock until it does) but contains zero
1392-
* (because we are careful to pre-zero offset pages). Because
1393-
* GetNewMultiXactId will never return zero as the starting offset for a
1394-
* multixact, when we read zero as the next multixact's offset, we know we
1395-
* have this case. We handle this by sleeping on the condition variable
1396-
* we have just for this; the process in charge will signal the CV as soon
1397-
* as it has finished writing the multixact offset.
1398-
*
1399-
* 3. Because GetNewMultiXactId increments offset zero to offset one to
1400-
* handle case #2, there is an ambiguity near the point of offset
1422+
* Because GetNewMultiXactId skips over offset zero, to reserve zero for
1423+
* to mean "unset", there is an ambiguity near the point of offset
14011424
* wraparound. If we see next multixact's offset is one, is that our
14021425
* multixact's actual endpoint, or did it end at zero with a subsequent
14031426
* increment? We handle this using the knowledge that if the zero'th
14041427
* member slot wasn't filled, it'll contain zero, and zero isn't a valid
14051428
* transaction ID so it can't be a multixact member. Therefore, if we
14061429
* read a zero from the members array, just ignore it.
1407-
*
1408-
* This is all pretty messy, but the mess occurs only in infrequent corner
1409-
* cases, so it seems better than holding the MultiXactGenLock for a long
1410-
* time on every multixact creation.
14111430
*/
1412-
retry:
14131431
pageno = MultiXactIdToOffsetPage(multi);
14141432
entryno = MultiXactIdToOffsetEntry(multi);
14151433

14161434
/* Acquire the bank lock for the page we need. */
14171435
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
14181436
LWLockAcquire(lock, LW_EXCLUSIVE);
14191437

1438+
/* read this multi's offset */
14201439
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
14211440
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
14221441
offptr += entryno;
14231442
offset = *offptr;
14241443

14251444
Assert(offset != 0);
14261445

1427-
/*
1428-
* Use the same increment rule as GetNewMultiXactId(), that is, don't
1429-
* handle wraparound explicitly until needed.
1430-
*/
1431-
tmpMXact = multi + 1;
1432-
1433-
if (nextMXact == tmpMXact)
1434-
{
1435-
/* Corner case 1: there is no next multixact */
1436-
length = nextOffset - offset;
1437-
}
1438-
else
1446+
/* read next multi's offset */
14391447
{
1448+
MultiXactId tmpMXact;
14401449
MultiXactOffset nextMXOffset;
14411450

14421451
/* handle wraparound if needed */
1452+
tmpMXact = multi + 1;
14431453
if (tmpMXact < FirstMultiXactId)
14441454
tmpMXact = FirstMultiXactId;
14451455

@@ -1472,31 +1482,18 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
14721482
nextMXOffset = *offptr;
14731483

14741484
if (nextMXOffset == 0)
1475-
{
1476-
/* Corner case 2: next multixact is still being filled in */
1477-
LWLockRelease(lock);
1478-
CHECK_FOR_INTERRUPTS();
1479-
1480-
INJECTION_POINT("multixact-get-members-cv-sleep", NULL);
1481-
1482-
ConditionVariableSleep(&MultiXactState->nextoff_cv,
1483-
WAIT_EVENT_MULTIXACT_CREATION);
1484-
slept = true;
1485-
goto retry;
1486-
}
1485+
ereport(ERROR,
1486+
(errcode(ERRCODE_DATA_CORRUPTED),
1487+
errmsg("MultiXact %u has invalid next offset",
1488+
multi)));
14871489

14881490
length = nextMXOffset - offset;
14891491
}
14901492

14911493
LWLockRelease(lock);
14921494
lock = NULL;
14931495

1494-
/*
1495-
* If we slept above, clean up state; it's no longer needed.
1496-
*/
1497-
if (slept)
1498-
ConditionVariableCancelSleep();
1499-
1496+
/* read the members */
15001497
ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
15011498

15021499
truelength = 0;
@@ -1539,7 +1536,7 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
15391536

15401537
if (!TransactionIdIsValid(*xactptr))
15411538
{
1542-
/* Corner case 3: we must be looking at unused slot zero */
1539+
/* Corner case: we must be looking at unused slot zero */
15431540
Assert(offset == 0);
15441541
continue;
15451542
}
@@ -1986,7 +1983,6 @@ MultiXactShmemInit(void)
19861983

19871984
/* Make sure we zero out the per-backend state */
19881985
MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
1989-
ConditionVariableInit(&MultiXactState->nextoff_cv);
19901986
}
19911987
else
19921988
Assert(found);
@@ -2132,26 +2128,34 @@ TrimMultiXact(void)
21322128
pageno);
21332129

21342130
/*
2135-
* Zero out the remainder of the current offsets page. See notes in
2136-
* TrimCLOG() for background. Unlike CLOG, some WAL record covers every
2137-
* pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
2138-
* rule "write xlog before data," nextMXact successors may carry obsolete,
2139-
* nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
2140-
* operates normally.
2131+
* Set the offset of nextMXact on the offsets page. This is normally done
2132+
* in RecordNewMultiXact() of the previous multixact, but let's be sure
2133+
* the next page exists, if the nextMXact was reset with pg_resetwal for
2134+
* example.
2135+
*
2136+
* Zero out the remainder of the page. See notes in TrimCLOG() for
2137+
* background. Unlike CLOG, some WAL record covers every pg_multixact
2138+
* SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write
2139+
* xlog before data," nextMXact successors may carry obsolete, nonzero
2140+
* offset values.
21412141
*/
21422142
entryno = MultiXactIdToOffsetEntry(nextMXact);
2143-
if (entryno != 0)
21442143
{
21452144
intslotno;
21462145
MultiXactOffset *offptr;
21472146
LWLock *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
21482147

21492148
LWLockAcquire(lock, LW_EXCLUSIVE);
2150-
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
2149+
if (entryno == 0)
2150+
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2151+
else
2152+
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
21512153
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
21522154
offptr += entryno;
21532155

2154-
MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
2156+
*offptr = offset;
2157+
if (entryno != 0 && (entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ)
2158+
MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset));
21552159

21562160
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
21572161
LWLockRelease(lock);

src/include/access/xlog_internal.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
/*
3232
* Each page of XLOG file has a header like this:
3333
*/
34-
#define XLOG_PAGE_MAGIC 0xD119/* can be used as WAL version indicator */
34+
#define XLOG_PAGE_MAGIC 0xD11A/* can be used as WAL version indicator */
3535

3636
typedef struct XLogPageHeaderData
3737
{

0 commit comments

Comments
 (0)