mirror of
https://github.com/talent-plan/tinykv.git
synced 2025-01-13 13:50:43 +08:00
5e089a2cd1
Signed-off-by: Connor <zbk602423539@gmail.com> Co-authored-by: Nick Cameron <nrc@ncameron.org> Co-authored-by: linning <linningde25@gmail.com> Co-authored-by: YangKeao <keao.yang@yahoo.com> Co-authored-by: andylokandy <andylokandy@hotmail.com> Co-authored-by: Iosmanthus Teng <myosmanthustree@gmail.com>
401 lines
12 KiB
Go
401 lines
12 KiB
Go
// Copyright 2016 PingCAP, Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package member
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"math/rand"
|
|
"path"
|
|
"strconv"
|
|
"strings"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/pingcap-incubator/tinykv/proto/pkg/schedulerpb"
|
|
"github.com/pingcap-incubator/tinykv/scheduler/pkg/etcdutil"
|
|
"github.com/pingcap-incubator/tinykv/scheduler/server/config"
|
|
"github.com/pingcap-incubator/tinykv/scheduler/server/kv"
|
|
"github.com/pingcap/log"
|
|
"github.com/pkg/errors"
|
|
"go.etcd.io/etcd/clientv3"
|
|
"go.etcd.io/etcd/embed"
|
|
"go.etcd.io/etcd/mvcc/mvccpb"
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
const (
|
|
// The timeout to wait transfer etcd leader to complete.
|
|
moveLeaderTimeout = 5 * time.Second
|
|
requestTimeout = etcdutil.DefaultRequestTimeout
|
|
slowRequestTime = etcdutil.DefaultSlowRequestTime
|
|
)
|
|
|
|
// Member is used for the election related logic.
|
|
type Member struct {
|
|
leader atomic.Value
|
|
// Etcd and cluster information.
|
|
etcd *embed.Etcd
|
|
client *clientv3.Client
|
|
id uint64 // etcd server id.
|
|
member *schedulerpb.Member // current PD's info.
|
|
rootPath string
|
|
// memberValue is the serialized string of `member`. It will be save in
|
|
// etcd leader key when the PD node is successfully elected as the leader
|
|
// of the cluster. Every write will use it to check leadership.
|
|
memberValue string
|
|
}
|
|
|
|
// NewMember create a new Member.
|
|
func NewMember(etcd *embed.Etcd, client *clientv3.Client, id uint64) *Member {
|
|
return &Member{
|
|
etcd: etcd,
|
|
client: client,
|
|
id: id,
|
|
}
|
|
}
|
|
|
|
// ID returns the unique etcd ID for this server in etcd cluster.
|
|
func (m *Member) ID() uint64 {
|
|
return m.id
|
|
}
|
|
|
|
// MemberValue returns the member value.
|
|
func (m *Member) MemberValue() string {
|
|
return m.memberValue
|
|
}
|
|
|
|
// Member returns the member.
|
|
func (m *Member) Member() *schedulerpb.Member {
|
|
return m.member
|
|
}
|
|
|
|
// Etcd returns etcd related information.
|
|
func (m *Member) Etcd() *embed.Etcd {
|
|
return m.etcd
|
|
}
|
|
|
|
// IsLeader returns whether the server is leader or not.
|
|
func (m *Member) IsLeader() bool {
|
|
// If server is not started. Both leaderID and ID could be 0.
|
|
return m.GetLeaderID() == m.ID()
|
|
}
|
|
|
|
// GetLeaderID returns current leader's member ID.
|
|
func (m *Member) GetLeaderID() uint64 {
|
|
return m.GetLeader().GetMemberId()
|
|
}
|
|
|
|
// GetLeader returns current leader of PD cluster.
|
|
func (m *Member) GetLeader() *schedulerpb.Member {
|
|
leader := m.leader.Load()
|
|
if leader == nil {
|
|
return nil
|
|
}
|
|
member := leader.(*schedulerpb.Member)
|
|
if member.GetMemberId() == 0 {
|
|
return nil
|
|
}
|
|
return member
|
|
}
|
|
|
|
// EnableLeader sets the member to leader.
|
|
func (m *Member) EnableLeader() {
|
|
m.leader.Store(m.member)
|
|
}
|
|
|
|
// DisableLeader reset the leader value.
|
|
func (m *Member) DisableLeader() {
|
|
m.leader.Store(&schedulerpb.Member{})
|
|
}
|
|
|
|
// GetLeaderPath returns the path of the leader.
|
|
func (m *Member) GetLeaderPath() string {
|
|
return path.Join(m.rootPath, "leader")
|
|
}
|
|
|
|
// CheckLeader checks returns true if it is needed to check later.
|
|
func (m *Member) CheckLeader(name string) (*schedulerpb.Member, int64, bool) {
|
|
if m.GetEtcdLeader() == 0 {
|
|
log.Error("no etcd leader, check leader later")
|
|
time.Sleep(200 * time.Millisecond)
|
|
return nil, 0, true
|
|
}
|
|
|
|
leader, rev, err := getLeader(m.client, m.GetLeaderPath())
|
|
if err != nil {
|
|
log.Error("get leader meet error", zap.Error(err))
|
|
time.Sleep(200 * time.Millisecond)
|
|
return nil, 0, true
|
|
}
|
|
if leader != nil {
|
|
if m.isSameLeader(leader) {
|
|
// oh, we are already leader, we may meet something wrong
|
|
// in previous CampaignLeader. we can delete and campaign again.
|
|
log.Warn("the leader has not changed, delete and campaign again", zap.Stringer("old-leader", leader))
|
|
if err = m.deleteLeaderKey(); err != nil {
|
|
log.Error("delete leader key meet error", zap.Error(err))
|
|
time.Sleep(200 * time.Millisecond)
|
|
return nil, 0, true
|
|
}
|
|
}
|
|
}
|
|
return leader, rev, false
|
|
}
|
|
|
|
// CheckPriority if the leader will be moved according to the priority.
|
|
func (m *Member) CheckPriority(ctx context.Context) {
|
|
etcdLeader := m.GetEtcdLeader()
|
|
if etcdLeader == m.ID() || etcdLeader == 0 {
|
|
return
|
|
}
|
|
myPriority, err := m.GetMemberLeaderPriority(m.ID())
|
|
if err != nil {
|
|
log.Error("failed to load leader priority", zap.Error(err))
|
|
return
|
|
}
|
|
leaderPriority, err := m.GetMemberLeaderPriority(etcdLeader)
|
|
if err != nil {
|
|
log.Error("failed to load etcd leader priority", zap.Error(err))
|
|
return
|
|
}
|
|
if myPriority > leaderPriority {
|
|
err := m.MoveEtcdLeader(ctx, etcdLeader, m.ID())
|
|
if err != nil {
|
|
log.Error("failed to transfer etcd leader", zap.Error(err))
|
|
} else {
|
|
log.Info("transfer etcd leader",
|
|
zap.Uint64("from", etcdLeader),
|
|
zap.Uint64("to", m.ID()))
|
|
}
|
|
}
|
|
}
|
|
|
|
// MoveEtcdLeader tries to transfer etcd leader.
|
|
func (m *Member) MoveEtcdLeader(ctx context.Context, old, new uint64) error {
|
|
moveCtx, cancel := context.WithTimeout(ctx, moveLeaderTimeout)
|
|
defer cancel()
|
|
return errors.WithStack(m.etcd.Server.MoveLeader(moveCtx, old, new))
|
|
}
|
|
|
|
// getLeader gets server leader from etcd.
|
|
func getLeader(c *clientv3.Client, leaderPath string) (*schedulerpb.Member, int64, error) {
|
|
leader := &schedulerpb.Member{}
|
|
ok, rev, err := etcdutil.GetProtoMsgWithModRev(c, leaderPath, leader)
|
|
if err != nil {
|
|
return nil, 0, err
|
|
}
|
|
if !ok {
|
|
return nil, 0, nil
|
|
}
|
|
|
|
return leader, rev, nil
|
|
}
|
|
|
|
// GetEtcdLeader returns the etcd leader ID.
|
|
func (m *Member) GetEtcdLeader() uint64 {
|
|
return m.etcd.Server.Lead()
|
|
}
|
|
|
|
func (m *Member) isSameLeader(leader *schedulerpb.Member) bool {
|
|
return leader.GetMemberId() == m.ID()
|
|
}
|
|
|
|
// MemberInfo initializes the member info.
|
|
func (m *Member) MemberInfo(cfg *config.Config, name string, rootPath string) {
|
|
leader := &schedulerpb.Member{
|
|
Name: name,
|
|
MemberId: m.ID(),
|
|
ClientUrls: strings.Split(cfg.AdvertiseClientUrls, ","),
|
|
PeerUrls: strings.Split(cfg.AdvertisePeerUrls, ","),
|
|
}
|
|
|
|
data, err := leader.Marshal()
|
|
if err != nil {
|
|
// can't fail, so panic here.
|
|
log.Fatal("marshal leader meet error", zap.Stringer("leader", leader), zap.Error(err))
|
|
}
|
|
m.member = leader
|
|
m.memberValue = string(data)
|
|
m.rootPath = rootPath
|
|
}
|
|
|
|
// CampaignLeader is used to campaign the leader.
|
|
func (m *Member) CampaignLeader(lease *LeaderLease, leaseTimeout int64) error {
|
|
err := lease.Grant(leaseTimeout)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
leaderKey := m.GetLeaderPath()
|
|
// The leader key must not exist, so the CreateRevision is 0.
|
|
resp, err := kv.NewSlowLogTxn(m.client).
|
|
If(clientv3.Compare(clientv3.CreateRevision(leaderKey), "=", 0)).
|
|
Then(clientv3.OpPut(leaderKey, m.memberValue, clientv3.WithLease(lease.ID))).
|
|
Commit()
|
|
if err != nil {
|
|
return errors.WithStack(err)
|
|
}
|
|
if !resp.Succeeded {
|
|
return errors.New("failed to campaign leader, other server may campaign ok")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// ResignLeader resigns current PD's leadership. If nextLeader is empty, all
|
|
// other pd-servers can campaign.
|
|
func (m *Member) ResignLeader(ctx context.Context, from string, nextLeader string) error {
|
|
log.Info("try to resign leader to next leader", zap.String("from", from), zap.String("to", nextLeader))
|
|
// Determine next leaders.
|
|
var leaderIDs []uint64
|
|
res, err := etcdutil.ListEtcdMembers(m.client)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
for _, member := range res.Members {
|
|
if (nextLeader == "" && member.ID != m.id) || (nextLeader != "" && member.Name == nextLeader) {
|
|
leaderIDs = append(leaderIDs, member.GetID())
|
|
}
|
|
}
|
|
if len(leaderIDs) == 0 {
|
|
return errors.New("no valid pd to transfer leader")
|
|
}
|
|
nextLeaderID := leaderIDs[rand.Intn(len(leaderIDs))]
|
|
return m.MoveEtcdLeader(ctx, m.ID(), nextLeaderID)
|
|
}
|
|
|
|
// LeaderTxn returns txn() with a leader comparison to guarantee that
|
|
// the transaction can be executed only if the server is leader.
|
|
func (m *Member) LeaderTxn(cs ...clientv3.Cmp) clientv3.Txn {
|
|
txn := kv.NewSlowLogTxn(m.client)
|
|
return txn.If(append(cs, m.leaderCmp())...)
|
|
}
|
|
|
|
func (m *Member) getMemberLeaderPriorityPath(id uint64) string {
|
|
return path.Join(m.rootPath, fmt.Sprintf("member/%d/leader_priority", id))
|
|
}
|
|
|
|
// SetMemberLeaderPriority saves a member's priority to be elected as the etcd leader.
|
|
func (m *Member) SetMemberLeaderPriority(id uint64, priority int) error {
|
|
key := m.getMemberLeaderPriorityPath(id)
|
|
res, err := m.LeaderTxn().Then(clientv3.OpPut(key, strconv.Itoa(priority))).Commit()
|
|
if err != nil {
|
|
return errors.WithStack(err)
|
|
}
|
|
if !res.Succeeded {
|
|
return errors.New("save leader priority failed, maybe not leader")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// DeleteMemberLeaderPriority removes a member's priority config.
|
|
func (m *Member) DeleteMemberLeaderPriority(id uint64) error {
|
|
key := m.getMemberLeaderPriorityPath(id)
|
|
res, err := m.LeaderTxn().Then(clientv3.OpDelete(key)).Commit()
|
|
if err != nil {
|
|
return errors.WithStack(err)
|
|
}
|
|
if !res.Succeeded {
|
|
return errors.New("delete leader priority failed, maybe not leader")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// GetMemberLeaderPriority loads a member's priority to be elected as the etcd leader.
|
|
func (m *Member) GetMemberLeaderPriority(id uint64) (int, error) {
|
|
key := m.getMemberLeaderPriorityPath(id)
|
|
res, err := etcdutil.EtcdKVGet(m.client, key)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
if len(res.Kvs) == 0 {
|
|
return 0, nil
|
|
}
|
|
priority, err := strconv.ParseInt(string(res.Kvs[0].Value), 10, 32)
|
|
if err != nil {
|
|
return 0, errors.WithStack(err)
|
|
}
|
|
return int(priority), nil
|
|
}
|
|
|
|
func (m *Member) deleteLeaderKey() error {
|
|
// delete leader itself and let others start a new election again.
|
|
leaderKey := m.GetLeaderPath()
|
|
resp, err := m.LeaderTxn().Then(clientv3.OpDelete(leaderKey)).Commit()
|
|
if err != nil {
|
|
return errors.WithStack(err)
|
|
}
|
|
if !resp.Succeeded {
|
|
return errors.New("resign leader failed, we are not leader already")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (m *Member) leaderCmp() clientv3.Cmp {
|
|
return clientv3.Compare(clientv3.Value(m.GetLeaderPath()), "=", m.memberValue)
|
|
}
|
|
|
|
// WatchLeader is used to watch the changes of the leader.
|
|
func (m *Member) WatchLeader(serverCtx context.Context, leader *schedulerpb.Member, revision int64) {
|
|
m.leader.Store(leader)
|
|
defer m.leader.Store(&schedulerpb.Member{})
|
|
|
|
watcher := clientv3.NewWatcher(m.client)
|
|
defer watcher.Close()
|
|
|
|
ctx, cancel := context.WithCancel(serverCtx)
|
|
defer cancel()
|
|
|
|
// The revision is the revision of last modification on this key.
|
|
// If the revision is compacted, will meet required revision has been compacted error.
|
|
// In this case, use the compact revision to re-watch the key.
|
|
for {
|
|
rch := watcher.Watch(ctx, m.GetLeaderPath(), clientv3.WithRev(revision))
|
|
for wresp := range rch {
|
|
// meet compacted error, use the compact revision.
|
|
if wresp.CompactRevision != 0 {
|
|
log.Warn("required revision has been compacted, use the compact revision",
|
|
zap.Int64("required-revision", revision),
|
|
zap.Int64("compact-revision", wresp.CompactRevision))
|
|
revision = wresp.CompactRevision
|
|
break
|
|
}
|
|
if wresp.Canceled {
|
|
log.Error("leader watcher is canceled with", zap.Int64("revision", revision), zap.Error(wresp.Err()))
|
|
return
|
|
}
|
|
|
|
for _, ev := range wresp.Events {
|
|
if ev.Type == mvccpb.DELETE {
|
|
log.Info("leader is deleted")
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
select {
|
|
case <-ctx.Done():
|
|
// server closed, return
|
|
return
|
|
default:
|
|
}
|
|
}
|
|
}
|
|
|
|
// Close gracefully shuts down all servers/listeners.
|
|
func (m *Member) Close() {
|
|
m.Etcd().Close()
|
|
}
|