
[gcc13] STL Source Analysis: std::shared_ptr

No preamble, let's go straight to the source. Everything below is based on the shared_ptr implementation shipped with GCC 13.

The structure of std::shared_ptr

//<bits/shared_ptr.h>
template<typename _Tp>
  class shared_ptr : public __shared_ptr<_Tp>
  {
    //...
    template<typename _Yp, typename = _Constructible<_Yp*>>
      explicit
      shared_ptr(_Yp* __p) : __shared_ptr<_Tp>(__p) { }
  };

//<bits/shared_ptr_base.h>
// Forward declarations.
template<typename _Tp, _Lock_policy _Lp = __default_lock_policy>
  class __shared_ptr;

template<typename _Tp, _Lock_policy _Lp>
  class __shared_ptr
  : public __shared_ptr_access<_Tp, _Lp>
  {
  public:
    using element_type = typename remove_extent<_Tp>::type;

  private:
    //...
    element_type*       _M_ptr;       // Contained pointer.
    __shared_count<_Lp> _M_refcount;  // Reference counter.
  };

// Define operator* and operator-> for shared_ptr<T>.
template<typename _Tp, _Lock_policy _Lp,
         bool = is_array<_Tp>::value, bool = is_void<_Tp>::value>
  class __shared_ptr_access
  {
  };

The template declarations above give a rough picture of the class hierarchy and its members: shared_ptr<T> derives from __shared_ptr<T, __default_lock_policy>, and __shared_ptr holds the actual pointer (element_type*) and the reference count member (__shared_count<__default_lock_policy>). We can set __shared_ptr_access aside for now and look at __shared_count first.
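
As a quick sanity check of this layout, here is a tiny illustrative snippet. The two-word size is a libstdc++ implementation detail rather than a standard guarantee, so treat the numbers as an observation, not a contract:

// A shared_ptr is the contained pointer plus a pointer to the control
// block, so on libstdc++ it is typically two machine words wide.
#include <cstdio>
#include <memory>

int main()
{
    std::shared_ptr<int> sp = std::make_shared<int>(42);
    std::printf("sizeof(shared_ptr<int>) = %zu, 2 * sizeof(void*) = %zu\n",
                sizeof(sp), 2 * sizeof(void*));
    std::printf("use_count = %ld\n", sp.use_count());
    return 0;
}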

__shared_count

Also in <bits/shared_ptr_base.h>:

template<_Lock_policy _Lp = __default_lock_policy>
  class _Sp_counted_base
  : public _Mutex_base<_Lp>
  {
  public:
    _Sp_counted_base() noexcept
    : _M_use_count(1), _M_weak_count(1) { }

    //...

  private:
    _Atomic_word  _M_use_count;   // #shared
    _Atomic_word  _M_weak_count;  // #weak + (#shared != 0)
  };

// Counted ptr with no deleter or allocator support
template<typename _Ptr, _Lock_policy _Lp>
  class _Sp_counted_ptr final : public _Sp_counted_base<_Lp>
  {
  public:
    explicit
    _Sp_counted_ptr(_Ptr __p) noexcept
    : _M_ptr(__p) { }

    virtual void
    _M_dispose() noexcept
    { delete _M_ptr; }

    virtual void
    _M_destroy() noexcept
    { delete this; }

    virtual void*
    _M_get_deleter(const std::type_info&) noexcept
    { return nullptr; }

    _Sp_counted_ptr(const _Sp_counted_ptr&) = delete;
    _Sp_counted_ptr& operator=(const _Sp_counted_ptr&) = delete;

  private:
    _Ptr _M_ptr;
  };

template<_Lock_policy _Lp>
  class __shared_count
  {
    //...
  private:
    _Sp_counted_base<_Lp>* _M_pi;
  };

_Atomic_word comes from <bits/atomic_word.h>; its concrete type depends on the architecture and platform, and on my test machine (x86_64-redhat) it is typedef int _Atomic_word. So the main thing to look at is how __shared_count manipulates the counts through _M_pi, the _Sp_counted_base pointer. Which concrete class _M_pi points to depends on how the shared_ptr was constructed; take make_shared as an example:

__shared_count in make_shared

<bits/shared_ptr.h>

template<typename _Tp, typename... _Args>
  inline shared_ptr<_NonArray<_Tp>>
  make_shared(_Args&&... __args)
  {
    using _Alloc = allocator<void>;
    _Alloc __a;
    return shared_ptr<_Tp>(_Sp_alloc_shared_tag<_Alloc>{__a},
                           std::forward<_Args>(__args)...);
  }

template<typename _Tp>
  class shared_ptr : public __shared_ptr<_Tp>
  {
  private:
    // This constructor is non-standard, it is used by allocate_shared.
    template<typename _Alloc, typename... _Args>
      shared_ptr(_Sp_alloc_shared_tag<_Alloc> __tag, _Args&&... __args)
      : __shared_ptr<_Tp>(__tag, std::forward<_Args>(__args)...)
      { }
  };

<bits/shared_ptr_base.h>

template<typename _Tp, _Lock_policy _Lp>
  class __shared_ptr
  : public __shared_ptr_access<_Tp, _Lp>
  {
  protected:
    // This constructor is non-standard, it is used by allocate_shared.
    template<typename _Alloc, typename... _Args>
      __shared_ptr(_Sp_alloc_shared_tag<_Alloc> __tag, _Args&&... __args)
      : _M_ptr(), _M_refcount(_M_ptr, __tag, std::forward<_Args>(__args)...)
      { _M_enable_shared_from_this_with(_M_ptr); }
  };

template<_Lock_policy _Lp>
  class __shared_count
  {
    template<typename _Tp, typename _Alloc, typename... _Args>
      __shared_count(_Tp*& __p, _Sp_alloc_shared_tag<_Alloc> __a,
                     _Args&&... __args)
      {
        typedef _Sp_counted_ptr_inplace<_Tp, _Alloc, _Lp> _Sp_cp_type;
        typename _Sp_cp_type::__allocator_type __a2(__a._M_a);
        auto __guard = std::__allocate_guarded(__a2);
        _Sp_cp_type* __mem = __guard.get();
        auto __pi = ::new (__mem)
          _Sp_cp_type(__a._M_a, std::forward<_Args>(__args)...);
        __guard = nullptr;
        _M_pi = __pi;
        __p = __pi->_M_ptr();
      }
  };

In other words, make_shared goes through a special tag-dispatched constructor, and in this path _M_pi ends up pointing to _Sp_counted_ptr_inplace (concretely _Sp_counted_ptr_inplace<T, Alloc, Lp>), a subclass of _Sp_counted_base that holds the control block and the T object in a single allocation.
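
A rough way to observe that single allocation is to count calls to the global operator new. This is a coarse, illustrative measurement, not part of any library interface, and the expected counts are what libstdc++ typically produces:

// shared_ptr<T>(new T) performs two allocations (object + control block),
// while make_shared<T>() folds them into one.
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <new>

static int g_allocs = 0;

void* operator new(std::size_t n)
{
    ++g_allocs;
    if (void* p = std::malloc(n))
        return p;
    throw std::bad_alloc{};
}

void operator delete(void* p) noexcept { std::free(p); }
void operator delete(void* p, std::size_t) noexcept { std::free(p); }

int main()
{
    g_allocs = 0;
    { std::shared_ptr<int> sp(new int(1)); }
    std::printf("shared_ptr<int>(new int): %d allocations\n", g_allocs); // typically 2

    g_allocs = 0;
    { auto sp = std::make_shared<int>(1); }
    std::printf("make_shared<int>:         %d allocations\n", g_allocs); // typically 1
    return 0;
}

Now let's look at _Sp_counted_ptr_inplace itself: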

template<typename _Tp, typename _Alloc, _Lock_policy _Lp>
  class _Sp_counted_ptr_inplace final : public _Sp_counted_base<_Lp>
  {
    class _Impl : _Sp_ebo_helper<0, _Alloc>
    {
      typedef _Sp_ebo_helper<0, _Alloc> _A_base;

    public:
      explicit _Impl(_Alloc __a) noexcept : _A_base(__a) { }

      _Alloc& _M_alloc() noexcept { return _A_base::_S_get(*this); }

      __gnu_cxx::__aligned_buffer<_Tp> _M_storage;
    };

    // Alloc parameter is not a reference so doesn't alias anything in __args
    template<typename... _Args>
      _Sp_counted_ptr_inplace(_Alloc __a, _Args&&... __args)
      : _M_impl(__a)
      {
        // _GLIBCXX_RESOLVE_LIB_DEFECTS
        // 2070. allocate_shared should use allocator_traits<A>::construct
        allocator_traits<_Alloc>::construct(__a, _M_ptr(),
            std::forward<_Args>(__args)...); // might throw
      }

    virtual void
    _M_dispose() noexcept
    {
      allocator_traits<_Alloc>::destroy(_M_impl._M_alloc(), _M_ptr());
    }

    // Override because the allocator needs to know the dynamic type
    virtual void
    _M_destroy() noexcept
    {
      __allocator_type __a(_M_impl._M_alloc());
      __allocated_ptr<__allocator_type> __guard_ptr{ __a, this };
      this->~_Sp_counted_ptr_inplace();
    }

  private:
    _Impl _M_impl;
  };

Anyone who reads STL code regularly will recognize EBO (empty base optimization) and __aligned_buffer, so I won't dwell on them here. It may look a little odd at first, but _Sp_counted_ptr_inplace really does store the T object created by make_shared inside its own _M_impl member, and it overrides the _Sp_counted_base virtual functions _M_dispose and _M_destroy to destroy that object and the control block. A stripped-down sketch of the storage idea follows; after that we return to how the reference count is incremented and decremented.
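
The following is only a sketch of that placement-new-into-aligned-storage idea, using a made-up type name (InplaceBlockSketch is not a libstdc++ class) and skipping the allocator and EBO machinery:

// The managed T lives in aligned raw storage inside the control block and
// is created with placement new; dispose() destroys only T, while freeing
// the block itself is a separate step (what _M_destroy does via the allocator).
#include <new>
#include <utility>

template<typename T>
struct InplaceBlockSketch
{
    alignas(T) unsigned char _M_storage[sizeof(T)];

    template<typename... Args>
    explicit InplaceBlockSketch(Args&&... args)
    { ::new (static_cast<void*>(_M_storage)) T(std::forward<Args>(args)...); }

    T* ptr() noexcept
    { return std::launder(reinterpret_cast<T*>(_M_storage)); }

    void dispose() noexcept { ptr()->~T(); }   // analogue of _M_dispose
};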

Reference count operations

Incrementing the count: the copy constructors of shared_ptr and __shared_ptr are compiler-generated defaults, so the real work is done by the copy constructor of __shared_count:

  __shared_count(const __shared_count& __r) noexcept
  : _M_pi(__r._M_pi)
  {
    if (_M_pi != nullptr)
      _M_pi->_M_add_ref_copy();
  }

It copies the _Sp_counted_base* directly and calls the base class's non-virtual member _M_add_ref_copy:

  // Increment the use count (used when the count is greater than zero).
  void
  _M_add_ref_copy()
  { __gnu_cxx::__atomic_add_dispatch(&_M_use_count, 1); }

<ext/atomicity.h>

inline void
__attribute__ ((__always_inline__))
__atomic_add_dispatch(_Atomic_word* __mem, int __val)
{
  if (__is_single_threaded())
    __atomic_add_single(__mem, __val);
  else
    __atomic_add(__mem, __val);
}
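
For reference, the two helpers that __atomic_add_dispatch chooses between boil down to roughly the following. This is a paraphrase of <ext/atomicity.h>, not a verbatim copy, and the exact memory order used by __atomic_add may differ between GCC versions:

// Sketch: single-threaded processes get a plain non-atomic add, everything
// else goes through an atomic fetch-and-add built on the GCC __atomic builtins.
typedef int _Atomic_word_sketch;   // matches _Atomic_word on this x86_64 setup

inline void atomic_add_single_sketch(_Atomic_word_sketch* mem, int val)
{ *mem += val; }                   // cf. __atomic_add_single

inline void atomic_add_sketch(_Atomic_word_sketch* mem, int val)
{ __atomic_fetch_add(mem, val, __ATOMIC_ACQ_REL); }   // cf. __atomic_add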

So the increment of _M_use_count in _Sp_counted_base is guaranteed to be atomic. While we are at it, the weak count is incremented the same way, also atomically:

  // Increment the weak count.
  void
  _M_weak_add_ref() noexcept
  { __gnu_cxx::__atomic_add_dispatch(&_M_weak_count, 1); }

Decrementing the count: shared_ptr and __shared_ptr use the default destructors, so the work ultimately falls on the destructor of __shared_count:

  ~__shared_count() noexcept
  {
    if (_M_pi != nullptr)
      _M_pi->_M_release();
  }

Again it is the base class's non-virtual _M_release that does the work, and it is specialized into three versions according to _Lock_policy, so this is where the lock policy finally shows up. The available policies:
<ext/concurrence.h>

namespace __gnu_cxx _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
  // Available locking policies:
  // _S_single    single-threaded code that doesn't need to be locked.
  // _S_mutex     multi-threaded code that requires additional support
  //              from gthr.h or abstraction layers in concurrence.h.
  // _S_atomic    multi-threaded code using atomic operations.
  enum _Lock_policy { _S_single, _S_mutex, _S_atomic };

  // Compile time constant that indicates prefered locking policy in
  // the current configuration.
  _GLIBCXX17_INLINE const _Lock_policy __default_lock_policy =
#ifndef __GTHREADS
  _S_single;
#elif defined _GLIBCXX_HAVE_ATOMIC_LOCK_POLICY
  _S_atomic;
#else
  _S_mutex;
#endif
}
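
If you want to see which policy your own toolchain picked, the enum is reachable through the same extension header, so a quick check looks like this (non-portable by design, since it pokes at libstdc++ internals):

// Prints the _Lock_policy selected for this build of libstdc++.
#include <cstdio>
#include <ext/concurrence.h>

int main()
{
    switch (__gnu_cxx::__default_lock_policy)
    {
    case __gnu_cxx::_S_single: std::puts("_S_single"); break;
    case __gnu_cxx::_S_mutex:  std::puts("_S_mutex");  break;
    case __gnu_cxx::_S_atomic: std::puts("_S_atomic"); break;
    }
    return 0;
}

Now, the three _M_release specializations:
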
template<>
  inline void
  _Sp_counted_base<_S_single>::_M_release() noexcept
  {
    if (--_M_use_count == 0)
      {
        _M_dispose();
        if (--_M_weak_count == 0)
          _M_destroy();
      }
  }

template<>
  inline void
  _Sp_counted_base<_S_mutex>::_M_release() noexcept
  {
    // Be race-detector-friendly. For more info see bits/c++config.
    _GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(&_M_use_count);
    if (__gnu_cxx::__exchange_and_add_dispatch(&_M_use_count, -1) == 1)
      {
        _M_release_last_use();
      }
  }

template<>
  inline void
  _Sp_counted_base<_S_atomic>::_M_release() noexcept
  {
    _GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(&_M_use_count);
#if ! _GLIBCXX_TSAN
    constexpr bool __lock_free
      = __atomic_always_lock_free(sizeof(long long), 0)
      && __atomic_always_lock_free(sizeof(_Atomic_word), 0);
    constexpr bool __double_word
      = sizeof(long long) == 2 * sizeof(_Atomic_word);
    // The ref-count members follow the vptr, so are aligned to
    // alignof(void*).
    constexpr bool __aligned = __alignof(long long) <= alignof(void*);
    if _GLIBCXX17_CONSTEXPR (__lock_free && __double_word && __aligned)
      {
        constexpr int __wordbits = __CHAR_BIT__ * sizeof(_Atomic_word);
        constexpr int __shiftbits = __double_word ? __wordbits : 0;
        constexpr long long __unique_ref = 1LL + (1LL << __shiftbits);
        auto __both_counts = reinterpret_cast<long long*>(&_M_use_count);

        _GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(&_M_weak_count);
        if (__atomic_load_n(__both_counts, __ATOMIC_ACQUIRE) == __unique_ref)
          {
            // Both counts are 1, so there are no weak references and
            // we are releasing the last strong reference. No other
            // threads can observe the effects of this _M_release()
            // call (e.g. calling use_count()) without a data race.
            _M_weak_count = _M_use_count = 0;
            _GLIBCXX_SYNCHRONIZATION_HAPPENS_AFTER(&_M_use_count);
            _GLIBCXX_SYNCHRONIZATION_HAPPENS_AFTER(&_M_weak_count);
            _M_dispose();
            _M_destroy();
            return;
          }
        if (__gnu_cxx::__exchange_and_add_dispatch(&_M_use_count, -1) == 1)
          [[__unlikely__]]
          {
            _M_release_last_use_cold();
            return;
          }
      }
    else
#endif
    if (__gnu_cxx::__exchange_and_add_dispatch(&_M_use_count, -1) == 1)
      {
        _M_release_last_use();
      }
  }

The _S_single version has the simplest logic: decrement the strong count non-atomically; if it reaches zero, call the virtual _M_dispose to destroy the managed object, then decrement the weak count non-atomically, and if that also reaches zero, call the virtual _M_destroy to destroy the control block itself.
Nowadays __default_lock_policy is usually _S_atomic, which is the longest version above. The _GLIBCXX_SYNCHRONIZATION_HAPPENS_* macros are explained in c++config:

// Macros for race detectors.
// _GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(A) and
// _GLIBCXX_SYNCHRONIZATION_HAPPENS_AFTER(A) should be used to explain
// atomic (lock-free) synchronization to race detectors:
// the race detector will infer a happens-before arc from the former to the
// latter when they share the same argument pointer.
//
// The most frequent use case for these macros (and the only case in the
// current implementation of the library) is atomic reference counting:
// void _M_remove_reference()
// {
// _GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(&this->_M_refcount);
// if (__gnu_cxx::__exchange_and_add_dispatch(&this->_M_refcount, -1) <= 0)
// {
// _GLIBCXX_SYNCHRONIZATION_HAPPENS_AFTER(&this->_M_refcount);
// _M_destroy(__a);
// }
// }
// The annotations in this example tell the race detector that all memory
// accesses occurred when the refcount was positive do not race with
// memory accesses which occurred after the refcount became zero.

In other words, this pair of macros exists to describe lock-free synchronization to a race detector: the detector infers a happens-before edge from the BEFORE annotation to the AFTER annotation when they are given the same address, so accesses made while the refcount was still positive are not reported as racing with accesses made after it dropped to zero.
Back in the _S_atomic _M_release: after checking at compile time that the environment allows it, the function reinterprets the two adjacent counters (starting at the strong count) as a single long long, so one atomic load can test whether both counts are 1. If they are, there are no weak references and this is the last strong reference, so it calls _M_dispose and _M_destroy right away. If only the strong count drops to zero, it calls the non-inlined _M_release_last_use_cold, which simply forwards to _M_release_last_use (shown after the short sketch below).
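
To make the __unique_ref arithmetic concrete, here is a standalone check. It assumes a 32-bit _Atomic_word, as on x86_64:

// Packing use_count == 1 and weak_count == 1 into one 64-bit word gives
// 1 + (1 << 32) = 0x100000001, whichever half holds which counter.
#include <cstdio>

int main()
{
    constexpr int word_bits = 8 * sizeof(int);   // bits in _Atomic_word (int here)
    constexpr long long unique_ref = 1LL + (1LL << word_bits);
    std::printf("__unique_ref = 0x%llx\n",
                static_cast<unsigned long long>(unique_ref)); // 0x100000001
    return 0;
}

Here is _M_release_last_use, together with its cold wrapper: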

  // Called by _M_release() when the use count reaches zero.
  void
  _M_release_last_use() noexcept
  {
    _GLIBCXX_SYNCHRONIZATION_HAPPENS_AFTER(&_M_use_count);
    _M_dispose();
    // There must be a memory barrier between dispose() and destroy()
    // to ensure that the effects of dispose() are observed in the
    // thread that runs destroy().
    // See http://gcc.gnu.org/ml/libstdc++/2005-11/msg00136.html
    if (_Mutex_base<_Lp>::_S_need_barriers)
      {
        __atomic_thread_fence (__ATOMIC_ACQ_REL);
      }

    // Be race-detector-friendly. For more info see bits/c++config.
    _GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(&_M_weak_count);
    if (__gnu_cxx::__exchange_and_add_dispatch(&_M_weak_count,
                                               -1) == 1)
      {
        _GLIBCXX_SYNCHRONIZATION_HAPPENS_AFTER(&_M_weak_count);
        _M_destroy();
      }
  }

  // As above, but 'noinline' to reduce code size on the cold path.
  __attribute__((__noinline__))
  void
  _M_release_last_use_cold() noexcept
  { _M_release_last_use(); }

_Mutex_base<_Lp>::_S_need_barriers is true only when the lock policy is _S_mutex. Once the weak count drops to zero, _M_destroy is called to destroy the control block itself.

UML diagram

(Diagram: shared_ptr class structure)

Closing remarks

To sum up: in a multithreaded program where several shared_ptr instances point at the same object, their destruction is guaranteed to release the object correctly without any lock, with no double destruction and no double free.
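
To back that claim with something runnable, here is an illustrative stress sketch: several threads repeatedly copy and drop owners of the same object, and the destructor is expected to run exactly once. A single clean run proves little by itself; running it under ThreadSanitizer or AddressSanitizer is the stronger check:

// Build with: g++ -std=c++17 -O2 -pthread stress.cc (add -fsanitize=thread to verify)
#include <atomic>
#include <cstdio>
#include <memory>
#include <thread>
#include <vector>

static std::atomic<int> g_dtor_calls{0};

struct Widget
{
    ~Widget() { g_dtor_calls.fetch_add(1, std::memory_order_relaxed); }
};

int main()
{
    {
        auto sp = std::make_shared<Widget>();
        std::vector<std::thread> threads;
        for (int i = 0; i < 8; ++i)
            threads.emplace_back([sp]            // each thread owns its own copy
            {
                for (int j = 0; j < 100000; ++j)
                {
                    std::shared_ptr<Widget> local = sp; // atomic increment
                }                                       // atomic decrement
            });
        for (auto& t : threads)
            t.join();
    }                                            // last owner destroys Widget here

    std::printf("destructor calls: %d\n", g_dtor_calls.load());   // expect 1
    return 0;
}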

Writing this, I suddenly remembered that I had actually studied and tested the thread safety of shared_ptr's release before; it had just faded with time. As the saying goes, skills get rusty without practice: many things only stick when you keep using them.