Skip to content

Commit a785d67

Browse files
committed
Bug #17878183 NDB CONTINUOUSLY REBOOTING DUE TO FILE NOT FOUND (DBLQH: CAUSED BY ERROR 2341)
Fix for this bug.

Root cause:
- DICT overload resulting in GET_TABINFOREF
- Backup block failing to complete LCP_PREPARE due to GET_TABINFOREF
- LQH ignoring the failure to start the LCP
- Node restart finding the LCP file missing, and being unable to handle that

Fixes:
1. LQH no longer ignores a failure to start an LCP
2. DICT no longer REFs internal requests

This patch adds:
1. A new testcase (testNodeRestart -nGetTabInfoOverload)
2. A new queueing mechanism inside DICT, separating internal and external requests
3. A new SegmentList utility class for kernel code

Merging to 7.2+ requires some modifications:
- Integrate with the more recent fixes there.

Applied to 7.1 because a backport was requested there.
1 parent af54cd7 commit a785d67

File tree

14 files changed

+1891
-58
lines changed

14 files changed

+1891
-58
lines changed

storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp

Lines changed: 85 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2014, Oracle and/or its affiliates. All rights reserved.
2+
Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
33

44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License as published by
@@ -445,6 +445,10 @@ void Dbdict::execCONTINUEB(Signal* signal)
445445
jam();
446446
trans_commit_wait_gci(signal);
447447
break;
448+
case ZNEXT_GET_TAB_REQ:
449+
jam();
450+
startNextGetTabInfoReq(signal);
451+
break;
448452
default :
449453
ndbrequire(false);
450454
break;
@@ -486,7 +490,7 @@ void Dbdict::packTableIntoPages(Signal* signal)
486490
jam();
487491
sendGET_TABINFOREF(signal, &req_copy,
488492
GetTabInfoRef::TableNotDefined, __LINE__);
489-
initRetrieveRecord(0, 0, 0);
493+
initRetrieveRecord(signal, 0, 0);
490494
return;
491495
}
492496

@@ -506,7 +510,7 @@ void Dbdict::packTableIntoPages(Signal* signal)
506510
// cannot see another uncommitted trans
507511
sendGET_TABINFOREF(signal, &req_copy,
508512
(GetTabInfoRef::ErrorCode)err, __LINE__);
509-
initRetrieveRecord(0, 0, 0);
513+
initRetrieveRecord(signal, 0, 0);
510514
return;
511515
}
512516
}
@@ -534,7 +538,7 @@ void Dbdict::packTableIntoPages(Signal* signal)
534538
jam();
535539
sendGET_TABINFOREF(signal, &req_copy,
536540
GetTabInfoRef::TableNotDefined, __LINE__);
537-
initRetrieveRecord(0, 0, 0);
541+
initRetrieveRecord(signal, 0, 0);
538542
return;
539543
}
540544

@@ -551,7 +555,7 @@ void Dbdict::packTableIntoPages(Signal* signal)
551555
Uint32 dstRef = c_retrieveRecord.blockRef;
552556
sendSignal(dstRef, GSN_GET_TABINFOREF, signal,
553557
GetTabInfoRef::SignalLength, JBB);
554-
initRetrieveRecord(0,0,0);
558+
initRetrieveRecord(signal,0,0);
555559
return;
556560
}
557561
break;
@@ -1977,6 +1981,7 @@ Dbdict::Dbdict(Block_context& ctx):
19771981
c_attributeRecordHash(c_attributeRecordPool),
19781982
c_obj_name_hash(c_obj_pool),
19791983
c_obj_id_hash(c_obj_pool),
1984+
c_gettabinforeq_q(*this),
19801985
c_schemaOpHash(c_schemaOpPool),
19811986
c_schemaTransHash(c_schemaTransPool),
19821987
c_schemaTransList(c_schemaTransPool),
@@ -2308,14 +2313,34 @@ void Dbdict::initWriteSchemaRecord()
23082313

23092314
/**
 * Reset c_retrieveRecord once the current GET_TABINFO retrieval is finished
 * (post-patch behaviour; the '-' line below is the statement this patch
 * removes).
 *
 * All bookkeeping fields of c_retrieveRecord are cleared.  What happens to
 * busyState depends on c_gettabinforeq_q:
 *  - queue not empty: a CONTINUEB carrying ZNEXT_GET_TAB_REQ is sent to this
 *    block and busyState is not modified on this path;
 *  - queue empty: busyState is set to false.
 *
 * @param signal      Used to send the CONTINUEB; must be non-NULL whenever
 *                    the queue is not empty (enforced with ndbrequire).
 * @param i           Not referenced in the body shown here.
 * @param returnCode  Not referenced in the body shown here.
 */
void Dbdict::initRetrieveRecord(Signal* signal, Uint32 i, Uint32 returnCode)
23102315
{
2311-
c_retrieveRecord.busyState = false;
2316+
jam();
23122317
c_retrieveRecord.blockRef = 0;
23132318
c_retrieveRecord.m_senderData = RNIL;
23142319
c_retrieveRecord.tableId = RNIL;
23152320
c_retrieveRecord.currentSent = 0;
23162321
c_retrieveRecord.retrievedNoOfPages = 0;
23172322
c_retrieveRecord.retrievedNoOfWords = 0;
23182323
c_retrieveRecord.m_useLongSig = false;
2324+
2325+
if (!c_gettabinforeq_q.isEmpty())
2326+
{
2327+
jam();
2328+
ndbrequire(signal != NULL);
2329+
2330+
/* Take a real-time break now, CONTINUEB will
2331+
* start processing the next request.
2332+
* busyState = true will maintain fairness
2333+
*/
2334+
signal->theData[0] = ZNEXT_GET_TAB_REQ;
2335+
sendSignal(reference(), GSN_CONTINUEB, signal,
2336+
1, JBB);
2337+
}
2338+
else
2339+
{
2340+
/* Done */
2341+
c_retrieveRecord.busyState = false;
2342+
}
2343+
23192344
}//initRetrieveRecord()
23202345

23212346
void Dbdict::initSchemaRecord()
@@ -2823,6 +2848,9 @@ void Dbdict::execREAD_CONFIG_REQ(Signal* signal)
28232848
bat[1].bits.q = ZLOG_SIZE_OF_PAGES_IN_WORDS; // 2**13 = 8192 elements
28242849
bat[1].bits.v = 5; // 32 bits per element
28252850

2851+
// Initialize Segment Sub pool in GetTabInfoReq queue
2852+
ndbrequire(c_gettabinforeq_q.init(jamBuffer()));
2853+
28262854
initCommonData();
28272855
initRecords();
28282856

@@ -10033,60 +10061,52 @@ void Dbdict::execGET_TABINFOREQ(Signal* signal)
1003310061
}
1003410062

1003510063
GetTabInfoReq * const req = (GetTabInfoReq *)&signal->theData[0];
10036-
SectionHandle handle(this, signal);
1003710064

10038-
/**
10039-
* If I get a GET_TABINFO_REQ from myself
10040-
* it's is a one from the time queue
10041-
*/
10042-
bool fromTimeQueue = (signal->senderBlockRef() == reference());
10043-
10044-
if (ERROR_INSERTED(6215) && fromTimeQueue == false)
10065+
if (ERROR_INSERTED(6215) &&
10066+
(signal->senderBlockRef() != reference()))
1004510067
{
1004610068
jam();
1004710069
// API tries 100 times and (80/100)^100 is quite small..
1004810070
if (rand() % 100 >= 20)
1004910071
{
1005010072
jam();
10073+
SectionHandle handle(this, signal);
1005110074
releaseSections(handle);
1005210075
sendGET_TABINFOREF(signal, req, GetTabInfoRef::Busy, __LINE__);
1005310076
return;
1005410077
}
1005510078
// no CLEAR_ERROR_INSERT_VALUE
1005610079
}
1005710080

10058-
if (c_retrieveRecord.busyState && fromTimeQueue == true) {
10059-
jam();
10060-
10061-
sendSignalWithDelay(reference(), GSN_GET_TABINFOREQ, signal, 30,
10062-
signal->length(),
10063-
&handle);
10064-
return;
10065-
}//if
10066-
10067-
const Uint32 MAX_WAITERS = 5;
10068-
10069-
if(c_retrieveRecord.busyState && fromTimeQueue == false)
10081+
if (c_retrieveRecord.busyState)
1007010082
{
1007110083
jam();
10072-
if(c_retrieveRecord.noOfWaiters < MAX_WAITERS){
10084+
NodeInfo sendersNI = getNodeInfo(refToNode(req->senderRef));
10085+
bool internalReq = (sendersNI.m_type == NodeInfo::DB);
10086+
10087+
/* Queue request
10088+
* Will be processed later when current requests + queue are completed
10089+
*/
10090+
if (!c_gettabinforeq_q.tryEnqReq(internalReq,
10091+
signal))
10092+
{
1007310093
jam();
10074-
c_retrieveRecord.noOfWaiters++;
10094+
/**
10095+
* Enqueue failure resulting in Busy signal only allowed for
10096+
* external requests
10097+
*/
10098+
ndbrequire(!internalReq);
1007510099

10076-
sendSignalWithDelay(reference(), GSN_GET_TABINFOREQ, signal, 30,
10077-
signal->length(),
10078-
&handle);
10079-
return;
10100+
SectionHandle handle(this, signal);
10101+
releaseSections(handle);
10102+
10103+
sendGET_TABINFOREF(signal, req, GetTabInfoRef::Busy, __LINE__);
1008010104
}
10081-
releaseSections(handle);
10082-
sendGET_TABINFOREF(signal, req, GetTabInfoRef::Busy, __LINE__);
10105+
1008310106
return;
1008410107
}
1008510108

10086-
if(fromTimeQueue){
10087-
jam();
10088-
c_retrieveRecord.noOfWaiters--;
10089-
}
10109+
SectionHandle handle(this, signal);
1009010110

1009110111
const bool useLongSig = (req->requestType & GetTabInfoReq::LongSignalConf);
1009210112
const bool byName = (req->requestType & GetTabInfoReq::RequestByName);
@@ -29799,3 +29819,30 @@ Dbdict::send_event(Signal* signal,
2979929819
signal->theData[4] = refToNode(trans_ptr.p->m_clientRef);
2980029820
sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 5, JBB);
2980129821
}
29822+
29823+
/**
 * Start the next queued GET_TABINFOREQ.
 *
 * Called from execCONTINUEB when the ZNEXT_GET_TAB_REQ case is hit.
 * Preconditions (both enforced with ndbrequire): c_retrieveRecord.busyState
 * is set and c_gettabinforeq_q is not empty.
 *
 * The next request is dequeued into 'signal', busyState is cleared, and the
 * request is passed to the GSN_GET_TABINFOREQ handler of this block via
 * EXECUTE_DIRECT.
 *
 * @param signal  Signal object that receives the dequeued request and is
 *                then executed directly.
 */
void
29824+
Dbdict::startNextGetTabInfoReq(Signal* signal)
29825+
{
29826+
jam();
29827+
29828+
/* Retrieve record should be in busy state to block
29829+
* queue-jumpers. We can clear it now as we will
29830+
* execute direct from the head of the queue(s)
29831+
*/
29832+
ndbrequire(c_retrieveRecord.busyState);
29833+
ndbrequire(!c_gettabinforeq_q.isEmpty());
29834+
29835+
/* Directly start next queued request
29836+
* Prefer internalQueue, but give externalQueue entries
29837+
* a proportional share to avoid starvation.
29838+
*/
29839+
ndbrequire(c_gettabinforeq_q.deqReq(signal));
29840+
29841+
/* Must be cleared before the direct execution below: execGET_TABINFOREQ
 * queues (or refuses) any request it receives while busyState is set.
 */
c_retrieveRecord.busyState = false;
29842+
29843+
/* Execute...using EXECUTE_DIRECT to get signal trace */
29844+
EXECUTE_DIRECT(number(),
29845+
GSN_GET_TABINFOREQ,
29846+
signal,
29847+
GetTabInfoReq::SignalLength);
29848+
}

0 commit comments

Comments
 (0)