Skip to content

Commit dc311dc

Browse files
authored
Merge 11640a8 into ffa96b1
2 parents ffa96b1 + 11640a8 commit dc311dc

File tree

20 files changed

+745
-104
lines changed

20 files changed

+745
-104
lines changed

ydb/core/testlib/tablet_helpers.cpp

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1151,6 +1151,7 @@ namespace NKikimr {
11511151
HFunc(TEvHive::TEvAdoptTablet, Handle);
11521152
HFunc(TEvHive::TEvDeleteTablet, Handle);
11531153
HFunc(TEvHive::TEvDeleteOwnerTablets, Handle);
1154+
HFunc(TEvHive::TEvStopTablet, Handle);
11541155
HFunc(TEvHive::TEvRequestHiveInfo, Handle);
11551156
HFunc(TEvHive::TEvInitiateTabletExternalBoot, Handle);
11561157
HFunc(TEvHive::TEvUpdateTabletsObject, Handle);
@@ -1179,7 +1180,7 @@ namespace NKikimr {
11791180

11801181
void Handle(TEvHive::TEvCreateTablet::TPtr& ev, const TActorContext& ctx) {
11811182
LOG_INFO_S(ctx, NKikimrServices::HIVE, "[" << TabletID() << "] TEvCreateTablet, msg: " << ev->Get()->Record.ShortDebugString());
1182-
Cout << "FAKEHIVE " << TabletID() << " TEvCreateTablet " << ev->Get()->Record.ShortDebugString() << Endl;
1183+
Cerr << "FAKEHIVE " << TabletID() << " TEvCreateTablet " << ev->Get()->Record.ShortDebugString() << Endl;
11831184
NKikimrProto::EReplyStatus status = NKikimrProto::OK;
11841185
const std::pair<ui64, ui64> key(ev->Get()->Record.GetOwner(), ev->Get()->Record.GetOwnerIdx());
11851186
const auto type = ev->Get()->Record.GetTabletType();
@@ -1256,6 +1257,9 @@ namespace NKikimr {
12561257
auto& boundChannels = ev->Get()->Record.GetBindedChannels();
12571258
it->second.BoundChannels.assign(boundChannels.begin(), boundChannels.end());
12581259
it->second.ChannelsProfile = ev->Get()->Record.GetChannelsProfile();
1260+
1261+
it->second.State = ETabletState::ReadyToWork;
1262+
it->second.ObjectDomain = TSubDomainKey(ev->Get()->Record.GetObjectDomain());
12591263
}
12601264

12611265
ctx.Send(ev->Sender, new TEvHive::TEvCreateTabletReply(status, key.first,
@@ -1355,7 +1359,7 @@ namespace NKikimr {
13551359
void Handle(TEvHive::TEvDeleteTablet::TPtr &ev, const TActorContext &ctx) {
13561360
LOG_INFO_S(ctx, NKikimrServices::HIVE, "[" << TabletID() << "] TEvDeleteTablet, msg: " << ev->Get()->Record.ShortDebugString());
13571361
NKikimrHive::TEvDeleteTablet& rec = ev->Get()->Record;
1358-
Cout << "FAKEHIVE " << TabletID() << " TEvDeleteTablet " << rec.ShortDebugString() << Endl;
1362+
Cerr << "FAKEHIVE " << TabletID() << " TEvDeleteTablet " << rec.ShortDebugString() << Endl;
13591363
TVector<ui64> deletedIdx;
13601364
for (size_t i = 0; i < rec.ShardLocalIdxSize(); ++i) {
13611365
auto id = std::make_pair<ui64, ui64>(rec.GetShardOwnerId(), rec.GetShardLocalIdx(i));
@@ -1368,7 +1372,7 @@ namespace NKikimr {
13681372
void Handle(TEvHive::TEvDeleteOwnerTablets::TPtr &ev, const TActorContext &ctx) {
13691373
LOG_INFO_S(ctx, NKikimrServices::HIVE, "[" << TabletID() << "] TEvDeleteOwnerTablets, msg: " << ev->Get()->Record);
13701374
NKikimrHive::TEvDeleteOwnerTablets& rec = ev->Get()->Record;
1371-
Cout << "FAKEHIVE " << TabletID() << " TEvDeleteOwnerTablets " << rec.ShortDebugString() << Endl;
1375+
Cerr << "FAKEHIVE " << TabletID() << " TEvDeleteOwnerTablets " << rec.ShortDebugString() << Endl;
13721376
auto ownerId = rec.GetOwner();
13731377
TVector<ui64> toDelete;
13741378

@@ -1400,6 +1404,34 @@ namespace NKikimr {
14001404
ctx.Send(ev->Sender, new TEvHive::TEvDeleteOwnerTabletsReply(NKikimrProto::OK, TabletID(), ownerId, rec.GetTxId()));
14011405
}
14021406

1407+
void StopTablet(const ui64& tabletId, const TActorContext &ctx) {
1408+
auto ownerIt = State->TabletIdToOwner.find(tabletId);
1409+
if (ownerIt == State->TabletIdToOwner.end()) {
1410+
return;
1411+
}
1412+
auto it = State->Tablets.find(ownerIt->second);
1413+
if (it == State->Tablets.end()) {
1414+
return;
1415+
}
1416+
1417+
TFakeHiveTabletInfo& tabletInfo = it->second;
1418+
1419+
// Very similar to DeleteTablet but don't actually removes tablet
1420+
// Kill the tablet and don't restart it
1421+
TActorId bootstrapperActorId = tabletInfo.BootstrapperActorId;
1422+
ctx.Send(bootstrapperActorId, new TEvBootstrapper::TEvStandBy());
1423+
1424+
tabletInfo.State = ETabletState::Stopped;
1425+
}
1426+
1427+
// Handles TEvHive::TEvStopTablet: logs the request (to the HIVE log and to Cerr),
// stops the requested tablet and replies to the sender.
// The reply status is always NKikimrProto::OK; StopTablet reports no result, so an
// unknown tablet id is acknowledged the same way as a known one.
void Handle(TEvHive::TEvStopTablet::TPtr &ev, const TActorContext &ctx) {
    NKikimrHive::TEvStopTablet& request = ev->Get()->Record;

    LOG_INFO_S(ctx, NKikimrServices::HIVE, "[" << TabletID() << "] TEvStopTablet, msg: " << request.ShortDebugString());
    Cerr << "FAKEHIVE " << TabletID() << " TEvStopTablet " << request.ShortDebugString() << Endl;

    const auto stoppedTabletId = request.GetTabletID();
    StopTablet(stoppedTabletId, ctx);
    ctx.Send(ev->Sender, new TEvHive::TEvStopTabletResult(NKikimrProto::OK, stoppedTabletId));
}
1434+
14031435
void Handle(TEvHive::TEvRequestHiveInfo::TPtr &ev, const TActorContext &ctx) {
14041436
LOG_INFO_S(ctx, NKikimrServices::HIVE, "[" << TabletID() << "] TEvRequestHiveInfo, msg: " << ev->Get()->Record.ShortDebugString());
14051437
const auto& record = ev->Get()->Record;
@@ -1451,15 +1483,15 @@ namespace NKikimr {
14511483

14521484
void Handle(TEvHive::TEvUpdateDomain::TPtr &ev, const TActorContext &ctx) {
14531485
LOG_INFO_S(ctx, NKikimrServices::HIVE, "[" << TabletID() << "] TEvUpdateDomain, msg: " << ev->Get()->Record.ShortDebugString());
1454-
1486+
14551487
const TSubDomainKey subdomainKey(ev->Get()->Record.GetDomainKey());
14561488
NHive::TDomainInfo& domainInfo = State->Domains[subdomainKey];
14571489
if (ev->Get()->Record.HasServerlessComputeResourcesMode()) {
14581490
domainInfo.ServerlessComputeResourcesMode = ev->Get()->Record.GetServerlessComputeResourcesMode();
14591491
} else {
14601492
domainInfo.ServerlessComputeResourcesMode.Clear();
14611493
}
1462-
1494+
14631495
auto response = std::make_unique<TEvHive::TEvUpdateDomainReply>();
14641496
response->Record.SetTxId(ev->Get()->Record.GetTxId());
14651497
response->Record.SetOrigin(TabletID());
@@ -1511,7 +1543,8 @@ namespace NKikimr {
15111543
tabletInfo.SetTabletID(tabletId);
15121544
if (info) {
15131545
tabletInfo.SetTabletType(info->Type);
1514-
tabletInfo.SetState(200); // THive::ReadyToWork
1546+
tabletInfo.SetState(ui32(info->State)); // THive::ETabletState::*
1547+
tabletInfo.MutableObjectDomain()->CopyFrom(info->ObjectDomain);
15151548

15161549
// TODO: fill other fields when needed
15171550
}

ydb/core/testlib/tablet_helpers.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ namespace NKikimr {
126126
: DomainKey(domainKey)
127127
{}
128128
};
129-
129+
130130
struct TEvRequestDomainInfoReply: public TEventLocal<TEvRequestDomainInfoReply, EvRequestDomainInfoReply> {
131131
NHive::TDomainInfo DomainInfo;
132132

@@ -137,10 +137,20 @@ namespace NKikimr {
137137

138138
};
139139

140+
141+
// Partial mirror of the NHive::ETabletState states from ydb/core/mind/hive/hive.h.
// Only the states listed below are mirrored; each enumerator's comment names the
// hive-side state it corresponds to. The numeric values are what the fake hive
// reports as the tablet state, so they must stay in sync with the real enum.
142+
enum class ETabletState : ui64 {
143+
Unknown = 0, // THive::ETabletState::Unknown (default for a freshly constructed tablet record)
144+
Stopped = 100, // THive::ETabletState::Stopped
145+
ReadyToWork = 200, // THive::ETabletState::ReadyToWork
146+
};
147+
140148
struct TFakeHiveTabletInfo {
141149
const TTabletTypes::EType Type;
142150
const ui64 TabletId;
143151
TActorId BootstrapperActorId;
152+
ETabletState State = ETabletState::Unknown;
153+
TSubDomainKey ObjectDomain; // what subdomain tablet belongs to
144154

145155
TChannelsBindings BoundChannels;
146156
ui32 ChannelsProfile;

ydb/core/tx/coordinator/mediator_queue.cpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,16 @@ static constexpr size_t ConfirmedStepsToFlush = 2;
2020
// the number of rows as large transactions are problematic to commit.
2121
static constexpr size_t ConfirmedParticipantsToFlush = 10'000;
2222

23+
// Coordinator must reconnect with mediator as quickly as possible if connection is lost.
24+
// Retry policy prevents the reconnect loop from becoming too aggressive.
25+
static NTabletPipe::TClientRetryPolicy MediatorSyncRetryPolicy{
26+
.RetryLimitCount = std::numeric_limits<ui32>::max(),
27+
.MinRetryTime = TDuration::MilliSeconds(1),
28+
.MaxRetryTime = TDuration::MilliSeconds(10),
29+
.BackoffMultiplier = 2,
30+
.DoFirstRetryInstantly = true,
31+
};
32+
2333
void TMediatorStep::SerializeTo(TEvTxCoordinator::TEvCoordinatorStep *msg) const {
2434
for (const TTx &tx : Transactions) {
2535
NKikimrTx::TCoordinatorTransaction *x = msg->Record.AddTransactions();
@@ -50,6 +60,7 @@ class TTxCoordinatorMediatorQueue : public TActorBootstrapped<TTxCoordinatorMedi
5060
size_t ConfirmedParticipants = 0;
5161
size_t ConfirmedSteps = 0;
5262

63+
5364
void Die(const TActorContext &ctx) override {
5465
if (PipeClient) {
5566
NTabletPipe::CloseClient(ctx, PipeClient);
@@ -66,7 +77,7 @@ class TTxCoordinatorMediatorQueue : public TActorBootstrapped<TTxCoordinatorMedi
6677
PipeClient = TActorId();
6778
}
6879

69-
PipeClient = ctx.RegisterWithSameMailbox(NTabletPipe::CreateClient(ctx.SelfID, Mediator));
80+
PipeClient = ctx.RegisterWithSameMailbox(NTabletPipe::CreateClient(ctx.SelfID, Mediator, MediatorSyncRetryPolicy));
7081

7182
LOG_DEBUG_S(ctx, NKikimrServices::TX_COORDINATOR_MEDIATOR_QUEUE, "Actor# " << ctx.SelfID.ToString()
7283
<< " tablet# " << Coordinator << " SEND EvCoordinatorSync to# " << Mediator << " Mediator");

ydb/core/tx/schemeshard/schemeshard__clean_pathes.cpp

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,13 @@ struct TSchemeShard::TTxCleanDroppedSubDomains : public TTransactionBase<TScheme
224224
<< ", at schemeshard: "<< Self->TabletID());
225225
Self->PersistRemoveSubDomain(db, pathId);
226226
++RemovedCount;
227+
228+
// This is for tests, so that tests could wait for actual lifetime end of a subdomain.
229+
// It's kinda ok to reply from execute, and actually required for tests with reboots
230+
// (to not lose event on a tablet reboot).
231+
{
232+
ctx.Send(Self->SelfId(), new TEvPrivate::TEvTestNotifySubdomainCleanup(pathId));
233+
}
227234
} else {
228235
// Probably never happens, but better safe than sorry.
229236
++SkippedCount;
@@ -237,12 +244,13 @@ struct TSchemeShard::TTxCleanDroppedSubDomains : public TTransactionBase<TScheme
237244
Y_ABORT_UNLESS(Self->CleanDroppedSubDomainsInFly);
238245

239246
if (RemovedCount || SkippedCount) {
240-
LOG_NOTICE_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD,
241-
"TTxCleanDroppedSubDomains Complete"
242-
<< ", done PersistRemoveSubDomain for " << RemovedCount << " paths"
243-
<< ", skipped " << SkippedCount
244-
<< ", left " << Self->CleanDroppedSubDomainsCandidates.size() << " candidates"
245-
<< ", at schemeshard: "<< Self->TabletID());
247+
LOG_NOTICE_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, "TTxCleanDroppedSubDomains Complete"
248+
<< ", done PersistRemoveSubDomain for " << RemovedCount << " subdomains"
249+
<< ", skipped " << SkippedCount
250+
<< ", left " << Self->CleanDroppedSubDomainsCandidates.size() << " candidates"
251+
<< ", at schemeshard: "<< Self->TabletID()
252+
);
253+
246254
}
247255

248256
if (!Self->CleanDroppedSubDomainsCandidates.empty()) {

ydb/core/tx/schemeshard/schemeshard__init.cpp

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,25 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
674674
return true;
675675
}
676676

677+
// Reads every row of Schema::SystemShardsToDelete and appends the corresponding
// local shard index to shardsToDelete.
//
// Returns false when the row set is not ready or advancing it fails, in which case
// the caller is expected to retry; rows appended before the failure are left in
// shardsToDelete. Returns true once the whole table has been read.
bool LoadSystemShardsToDelete(NIceDb::TNiceDb& db, TShardsToDeleteRows& shardsToDelete) const {
    auto rows = db.Table<Schema::SystemShardsToDelete>().Range().Select();
    if (!rows.IsReady()) {
        return false;
    }

    for (; !rows.EndOfSet(); ) {
        const auto localShardIdx = Self->MakeLocalId(rows.GetValue<Schema::SystemShardsToDelete::ShardIdx>());
        shardsToDelete.emplace_back(localShardIdx);

        if (!rows.Next()) {
            return false;
        }
    }

    return true;
}
695+
677696
typedef std::tuple<TOperationId, TShardIdx, TTxState::ETxState> TTxShardRec;
678697
typedef TVector<TTxShardRec> TTxShardsRows;
679698

@@ -3790,6 +3809,23 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
37903809
}
37913810
}
37923811

3812+
// Read system shards to delete
3813+
{
3814+
TShardsToDeleteRows shardsToDelete;
3815+
if (!LoadSystemShardsToDelete(db, shardsToDelete)) {
3816+
return false;
3817+
}
3818+
3819+
LOG_NOTICE_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD,
3820+
"TTxInit for SystemShardToDelete"
3821+
<< ", read records: " << shardsToDelete.size()
3822+
<< ", at schemeshard: " << Self->TabletID());
3823+
3824+
for (auto& rec: shardsToDelete) {
3825+
OnComplete.DeleteSystemShard(std::get<0>(rec));
3826+
}
3827+
}
3828+
37933829
// Read backup settings
37943830
{
37953831
TBackupSettingsRows backupSettings;
@@ -3845,7 +3881,7 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
38453881
auto desc = tableInfo->BackupSettings.MutableTable();
38463882
Y_ABORT_UNLESS(ParseFromStringNoSizeLimit(*desc, tableDesc));
38473883
}
3848-
3884+
38493885
if (changefeedUnderlyingTopics) {
38503886
NKikimrSchemeOp::TChangefeedUnderlyingTopics wrapperOverTopics;
38513887
Y_ABORT_UNLESS(ParseFromStringNoSizeLimit(wrapperOverTopics, changefeedUnderlyingTopics));
@@ -4525,12 +4561,12 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
45254561
try {
45264562
fill(buildInfo);
45274563
} catch (const std::exception& exc) {
4528-
LOG_ERROR_S(ctx, NKikimrServices::BUILD_INDEX,
4564+
LOG_ERROR_S(ctx, NKikimrServices::BUILD_INDEX,
45294565
"Init " << stepName << " unhandled exception, id#" << buildInfo.Id
45304566
<< " " << TypeName(exc) << ": " << exc.what() << Endl
45314567
<< TBackTrace::FromCurrentException().PrintToString()
45324568
<< ", TIndexBuildInfo: " << buildInfo);
4533-
4569+
45344570
// in-memory volatile state:
45354571
buildInfo.IsBroken = true;
45364572
buildInfo.AddIssue(TStringBuilder() << "Init " << stepName << " unhandled exception " << exc.what());
@@ -4541,7 +4577,7 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
45414577
const auto* buildInfoPtr = Self->IndexBuilds.FindPtr(id);
45424578
Y_ASSERT(buildInfoPtr);
45434579
if (!buildInfoPtr) {
4544-
LOG_ERROR_S(ctx, NKikimrServices::BUILD_INDEX,
4580+
LOG_ERROR_S(ctx, NKikimrServices::BUILD_INDEX,
45454581
"Init " << stepName << " BuildInfo not found: id#" << id);
45464582
return;
45474583
}

ydb/core/tx/schemeshard/schemeshard__operation_drop_extsubdomain.cpp

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,46 @@ namespace {
1010
using namespace NKikimr;
1111
using namespace NSchemeShard;
1212

13-
class TDeletePrivateShards: public TDeleteParts {
13+
class TDeleteSubdomainSystemShards: public TSubOperationState {
14+
protected:
15+
const TOperationId OperationId;
16+
17+
TString DebugHint() const override {
18+
return TStringBuilder() << "TDeleteSubdomainSystemShards" << " opId# " << OperationId << " ";
19+
}
20+
1421
public:
15-
explicit TDeletePrivateShards(const TOperationId& id)
16-
: TDeleteParts(id, TTxState::Done)
22+
explicit TDeleteSubdomainSystemShards(const TOperationId& id)
23+
: OperationId(id)
1724
{
1825
IgnoreMessages(DebugHint(), AllIncomingEvents());
1926
}
27+
28+
// Schedules deletion of all shards of the operation and moves the tx to Done.
//
// The operation must exist and be a TxForceDropExtSubDomain, and its target
// subdomain must be present; otherwise the process aborts. Each shard of the tx is
// handed to context.OnComplete: via DeleteShard when the subdomain reports a shared
// hive, via DeleteSystemShard otherwise. Always returns true.
bool ProgressState(TOperationContext& context) override {
    LOG_INFO_S(context.Ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, "[" << context.SS->SelfTabletId() << "] " << DebugHint() << "ProgressState");

    const auto* tx = context.SS->FindTx(OperationId);
    Y_ABORT_UNLESS(tx);
    Y_ABORT_UNLESS(tx->TxType == TTxState::TxForceDropExtSubDomain);

    auto subdomainInfo = context.SS->SubDomains.at(tx->TargetPathId);
    Y_ABORT_UNLESS(subdomainInfo);

    // Initiate asynchronous deletion of system shards.
    // The shared-hive check is evaluated once and selects the deletion path for every shard.
    const bool usesSharedHive = static_cast<bool>(subdomainInfo->GetSharedHive());
    for (const auto& shard : tx->Shards) {
        if (usesSharedHive) {
            context.OnComplete.DeleteShard(shard.Idx);
        } else {
            context.OnComplete.DeleteSystemShard(shard.Idx);
        }
    }

    NIceDb::TNiceDb db(context.GetDB());
    context.SS->ChangeTxState(db, OperationId, TTxState::Done);
    return true;
}
2053
};
2154

2255
class TDeleteExternalShards: public TSubOperationState {
@@ -237,7 +270,7 @@ class TDropExtSubdomain: public TSubOperation {
237270
case TTxState::DeleteExternalShards:
238271
return MakeHolder<TDeleteExternalShards>(OperationId);
239272
case TTxState::DeletePrivateShards:
240-
return MakeHolder<TDeletePrivateShards>(OperationId);
273+
return MakeHolder<TDeleteSubdomainSystemShards>(OperationId);
241274
case TTxState::Done:
242275
return MakeHolder<TDone>(OperationId);
243276
default:

0 commit comments

Comments
 (0)