7979#include "pg_trace.h"
8080#include "pgstat.h"
8181#include "postmaster/autovacuum.h"
82- #include "storage/condition_variable.h"
8382#include "storage/pmsignal.h"
8483#include "storage/proc.h"
8584#include "storage/procarray.h"
@@ -271,12 +270,6 @@ typedef struct MultiXactStateData
271270/* support for members anti-wraparound measures */
272271MultiXactOffset offsetStopLimit ;/* known if oldestOffsetKnown */
273272
274- /*
275- * This is used to sleep until a multixact offset is written when we want
276- * to create the next one.
277- */
278- ConditionVariable nextoff_cv ;
279-
280273/*
281274 * Per-backend data starts here. We have two arrays stored in the area
282275 * immediately following the MultiXactStateData struct. Each is indexed by
@@ -912,13 +905,33 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
912905int entryno ;
913906int slotno ;
914907MultiXactOffset * offptr ;
915- int i ;
908+ MultiXactId next ;
909+ int64 next_pageno ;
910+ int next_entryno ;
911+ MultiXactOffset * next_offptr ;
916912LWLock * lock ;
917913LWLock * prevlock = NULL ;
918914
915+ /* position of this multixid in the offsets SLRU area */
919916pageno = MultiXactIdToOffsetPage (multi );
920917entryno = MultiXactIdToOffsetEntry (multi );
921918
919+ /* position of the next multixid */
920+ next = multi + 1 ;
921+ if (next < FirstMultiXactId )
922+ next = FirstMultiXactId ;
923+ next_pageno = MultiXactIdToOffsetPage (next );
924+ next_entryno = MultiXactIdToOffsetEntry (next );
925+
926+ /*
927+ * Set the starting offset of this multixid's members.
928+ *
929+ * In the common case, it was already be set by the previous
930+ * RecordNewMultiXact call, as this was the next multixid of the previous
931+ * multixid. But if multiple backends are generating multixids
932+ * concurrently, we might race ahead and get called before the previous
933+ * multixid.
934+ */
922935lock = SimpleLruGetBankLock (MultiXactOffsetCtl , pageno );
923936LWLockAcquire (lock , LW_EXCLUSIVE );
924937
@@ -933,22 +946,50 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
933946offptr = (MultiXactOffset * ) MultiXactOffsetCtl -> shared -> page_buffer [slotno ];
934947offptr += entryno ;
935948
936- * offptr = offset ;
949+ if (* offptr != offset )
950+ {
951+ /* should already be set to the correct value, or not at all */
952+ Assert (* offptr == 0 );
953+ * offptr = offset ;
954+ MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
955+ }
956+
957+ /*
958+ * Set the next multixid's offset to the end of this multixid's members.
959+ */
960+ if (next_pageno == pageno )
961+ {
962+ next_offptr = offptr + 1 ;
963+ }
964+ else
965+ {
966+ /* must be the first entry on the page */
967+ Assert (next_entryno == 0 || next == FirstMultiXactId );
968+
969+ /* Swap the lock for a lock on the next page */
970+ LWLockRelease (lock );
971+ lock = SimpleLruGetBankLock (MultiXactOffsetCtl , next_pageno );
972+ LWLockAcquire (lock , LW_EXCLUSIVE );
973+
974+ slotno = SimpleLruReadPage (MultiXactOffsetCtl , next_pageno , true, next );
975+ next_offptr = (MultiXactOffset * ) MultiXactOffsetCtl -> shared -> page_buffer [slotno ];
976+ next_offptr += next_entryno ;
977+ }
937978
938- MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
979+ if (* next_offptr != offset + nmembers )
980+ {
981+ /* should already be set to the correct value, or not at all */
982+ Assert (* next_offptr == 0 );
983+ * next_offptr = offset + nmembers ;
984+ MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
985+ }
939986
940987/* Release MultiXactOffset SLRU lock. */
941988LWLockRelease (lock );
942989
943- /*
944- * If anybody was waiting to know the offset of this multixact ID we just
945- * wrote, they can read it now, so wake them up.
946- */
947- ConditionVariableBroadcast (& MultiXactState -> nextoff_cv );
948-
949990prev_pageno = -1 ;
950991
951- for (i = 0 ; i < nmembers ; i ++ , offset ++ )
992+ for (int i = 0 ; i < nmembers ; i ++ , offset ++ )
952993{
953994TransactionId * memberptr ;
954995uint32 * flagsptr ;
@@ -1138,8 +1179,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
11381179result = FirstMultiXactId ;
11391180}
11401181
1141- /* Make sure there is room for the MXID in the file. */
1142- ExtendMultiXactOffset (result );
1182+ /*
1183+ * Make sure there is room for the next MXID in the file. Assigning this
1184+ * MXID sets the next MXID's offset already.
1185+ */
1186+ ExtendMultiXactOffset (result + 1 );
11431187
11441188/*
11451189 * Reserve the members space, similarly to above. Also, be careful not to
@@ -1300,11 +1344,8 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13001344int truelength ;
13011345MultiXactId oldestMXact ;
13021346MultiXactId nextMXact ;
1303- MultiXactId tmpMXact ;
1304- MultiXactOffset nextOffset ;
13051347MultiXactMember * ptr ;
13061348LWLock * lock ;
1307- bool slept = false;
13081349
13091350debug_elog3 (DEBUG2 , "GetMembers: asked for %u" , multi );
13101351
@@ -1351,14 +1392,12 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13511392 * error.
13521393 *
13531394 * Shared lock is enough here since we aren't modifying any global state.
1354- * Acquire it just long enough to grab the current counter values. We may
1355- * need both nextMXact and nextOffset; see below.
1395+ * Acquire it just long enough to grab the current counter values.
13561396 */
13571397LWLockAcquire (MultiXactGenLock , LW_SHARED );
13581398
13591399oldestMXact = MultiXactState -> oldestMultiXactId ;
13601400nextMXact = MultiXactState -> nextMXact ;
1361- nextOffset = MultiXactState -> nextOffset ;
13621401
13631402LWLockRelease (MultiXactGenLock );
13641403
@@ -1378,68 +1417,39 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13781417 * Find out the offset at which we need to start reading MultiXactMembers
13791418 * and the number of members in the multixact. We determine the latter as
13801419 * the difference between this multixact's starting offset and the next
1381- * one's. However, there are some corner cases to worry about:
1420+ * one's. However, there is one corner case to worry about:
13821421 *
1383- * 1. This multixact may be the latest one created, in which case there is
1384- * no next one to look at. In this case the nextOffset value we just
1385- * saved is the correct endpoint.
1386- *
1387- * 2. The next multixact may still be in process of being filled in: that
1388- * is, another process may have done GetNewMultiXactId but not yet written
1389- * the offset entry for that ID. In that scenario, it is guaranteed that
1390- * the offset entry for that multixact exists (because GetNewMultiXactId
1391- * won't release MultiXactGenLock until it does) but contains zero
1392- * (because we are careful to pre-zero offset pages). Because
1393- * GetNewMultiXactId will never return zero as the starting offset for a
1394- * multixact, when we read zero as the next multixact's offset, we know we
1395- * have this case. We handle this by sleeping on the condition variable
1396- * we have just for this; the process in charge will signal the CV as soon
1397- * as it has finished writing the multixact offset.
1398- *
1399- * 3. Because GetNewMultiXactId increments offset zero to offset one to
1400- * handle case #2, there is an ambiguity near the point of offset
1422+ * Because GetNewMultiXactId skips over offset zero, to reserve zero for
1423+ * to mean "unset", there is an ambiguity near the point of offset
14011424 * wraparound. If we see next multixact's offset is one, is that our
14021425 * multixact's actual endpoint, or did it end at zero with a subsequent
14031426 * increment? We handle this using the knowledge that if the zero'th
14041427 * member slot wasn't filled, it'll contain zero, and zero isn't a valid
14051428 * transaction ID so it can't be a multixact member. Therefore, if we
14061429 * read a zero from the members array, just ignore it.
1407- *
1408- * This is all pretty messy, but the mess occurs only in infrequent corner
1409- * cases, so it seems better than holding the MultiXactGenLock for a long
1410- * time on every multixact creation.
14111430 */
1412- retry :
14131431pageno = MultiXactIdToOffsetPage (multi );
14141432entryno = MultiXactIdToOffsetEntry (multi );
14151433
14161434/* Acquire the bank lock for the page we need. */
14171435lock = SimpleLruGetBankLock (MultiXactOffsetCtl , pageno );
14181436LWLockAcquire (lock , LW_EXCLUSIVE );
14191437
1438+ /* read this multi's offset */
14201439slotno = SimpleLruReadPage (MultiXactOffsetCtl , pageno , true, multi );
14211440offptr = (MultiXactOffset * ) MultiXactOffsetCtl -> shared -> page_buffer [slotno ];
14221441offptr += entryno ;
14231442offset = * offptr ;
14241443
14251444Assert (offset != 0 );
14261445
1427- /*
1428- * Use the same increment rule as GetNewMultiXactId(), that is, don't
1429- * handle wraparound explicitly until needed.
1430- */
1431- tmpMXact = multi + 1 ;
1432-
1433- if (nextMXact == tmpMXact )
1434- {
1435- /* Corner case 1: there is no next multixact */
1436- length = nextOffset - offset ;
1437- }
1438- else
1446+ /* read next multi's offset */
14391447{
1448+ MultiXactId tmpMXact ;
14401449MultiXactOffset nextMXOffset ;
14411450
14421451/* handle wraparound if needed */
1452+ tmpMXact = multi + 1 ;
14431453if (tmpMXact < FirstMultiXactId )
14441454tmpMXact = FirstMultiXactId ;
14451455
@@ -1472,31 +1482,18 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
14721482nextMXOffset = * offptr ;
14731483
14741484if (nextMXOffset == 0 )
1475- {
1476- /* Corner case 2: next multixact is still being filled in */
1477- LWLockRelease (lock );
1478- CHECK_FOR_INTERRUPTS ();
1479-
1480- INJECTION_POINT ("multixact-get-members-cv-sleep" , NULL );
1481-
1482- ConditionVariableSleep (& MultiXactState -> nextoff_cv ,
1483- WAIT_EVENT_MULTIXACT_CREATION );
1484- slept = true;
1485- goto retry ;
1486- }
1485+ ereport (ERROR ,
1486+ (errcode (ERRCODE_DATA_CORRUPTED ),
1487+ errmsg ("MultiXact %u has invalid next offset" ,
1488+ multi )));
14871489
14881490length = nextMXOffset - offset ;
14891491}
14901492
14911493LWLockRelease (lock );
14921494lock = NULL ;
14931495
1494- /*
1495- * If we slept above, clean up state; it's no longer needed.
1496- */
1497- if (slept )
1498- ConditionVariableCancelSleep ();
1499-
1496+ /* read the members */
15001497ptr = (MultiXactMember * ) palloc (length * sizeof (MultiXactMember ));
15011498
15021499truelength = 0 ;
@@ -1539,7 +1536,7 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
15391536
15401537if (!TransactionIdIsValid (* xactptr ))
15411538{
1542- /* Corner case 3 : we must be looking at unused slot zero */
1539+ /* Corner case: we must be looking at unused slot zero */
15431540Assert (offset == 0 );
15441541continue ;
15451542}
@@ -1986,7 +1983,6 @@ MultiXactShmemInit(void)
19861983
19871984/* Make sure we zero out the per-backend state */
19881985MemSet (MultiXactState , 0 , SHARED_MULTIXACT_STATE_SIZE );
1989- ConditionVariableInit (& MultiXactState -> nextoff_cv );
19901986}
19911987else
19921988Assert (found );
@@ -2132,26 +2128,34 @@ TrimMultiXact(void)
21322128pageno );
21332129
21342130/*
2135- * Zero out the remainder of the current offsets page. See notes in
2136- * TrimCLOG() for background. Unlike CLOG, some WAL record covers every
2137- * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
2138- * rule "write xlog before data," nextMXact successors may carry obsolete,
2139- * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
2140- * operates normally.
2131+ * Set the offset of nextMXact on the offsets page. This is normally done
2132+ * in RecordNewMultiXact() of the previous multixact, but let's be sure
2133+ * the next page exists, if the nextMXact was reset with pg_resetwal for
2134+ * example.
2135+ *
2136+ * Zero out the remainder of the page. See notes in TrimCLOG() for
2137+ * background. Unlike CLOG, some WAL record covers every pg_multixact
2138+ * SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write
2139+ * xlog before data," nextMXact successors may carry obsolete, nonzero
2140+ * offset values.
21412141 */
21422142entryno = MultiXactIdToOffsetEntry (nextMXact );
2143- if (entryno != 0 )
21442143{
21452144int slotno ;
21462145MultiXactOffset * offptr ;
21472146LWLock * lock = SimpleLruGetBankLock (MultiXactOffsetCtl , pageno );
21482147
21492148LWLockAcquire (lock , LW_EXCLUSIVE );
2150- slotno = SimpleLruReadPage (MultiXactOffsetCtl , pageno , true, nextMXact );
2149+ if (entryno == 0 )
2150+ slotno = SimpleLruZeroPage (MultiXactOffsetCtl , pageno );
2151+ else
2152+ slotno = SimpleLruReadPage (MultiXactOffsetCtl , pageno , true, nextMXact );
21512153offptr = (MultiXactOffset * ) MultiXactOffsetCtl -> shared -> page_buffer [slotno ];
21522154offptr += entryno ;
21532155
2154- MemSet (offptr , 0 , BLCKSZ - (entryno * sizeof (MultiXactOffset )));
2156+ * offptr = offset ;
2157+ if (entryno != 0 && (entryno + 1 ) * sizeof (MultiXactOffset ) != BLCKSZ )
2158+ MemSet (offptr + 1 , 0 , BLCKSZ - (entryno + 1 ) * sizeof (MultiXactOffset ));
21552159
21562160MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
21572161LWLockRelease (lock );
0 commit comments