Partially inline basic_string copy constructor in UNSTABLE

    Summary:
    This is a recommit of https://reviews.llvm.org/D73223, where the added function accidentally ended up inside an #ifdef block.

    This change splits the copy constructor in two: initialization of short strings is inlined, and initialization of long strings is explicitly outlined into __init_copy_ctor_external(), which is the externally instantiated slow path.

    For the unstable ABI, this makes the following changes:

    - remove basic_string(const basic_string&)
    - remove basic_string(const basic_string&, const Allocator&)
    - add __init_copy_ctor_external(const value_type*, size_type)

    Quick local benchmark for Copy:

    Master
    ```
    ---------------------------------------------------------------
    Benchmark                    Time             CPU   Iterations
    ---------------------------------------------------------------
    BM_StringCopy_Empty       3.50 ns         3.51 ns    199326720
    BM_StringCopy_Small       3.50 ns         3.51 ns    199510016
    BM_StringCopy_Large       15.7 ns         15.7 ns     45230080
    BM_StringCopy_Huge        1503 ns         1503 ns       464896
    ```
    With this change
    ```
    ---------------------------------------------------------------
    Benchmark                    Time             CPU   Iterations
    ---------------------------------------------------------------
    BM_StringCopy_Empty       1.99 ns         2.00 ns    356471808
    BM_StringCopy_Small       3.29 ns         3.30 ns    203425792
    BM_StringCopy_Large       13.3 ns         13.3 ns     52948992
    BM_StringCopy_Huge        1472 ns         1472 ns       475136
    ```

    Subscribers: libcxx-commits

    Tags: #libc

    Differential Revision: https://reviews.llvm.org/D75639

Cr-Mirrored-From: https://chromium.googlesource.com/external/github.com/llvm/llvm-project
Cr-Mirrored-Commit: b019c5c0372eb08800327efb5e7955ce918b75d1
diff --git a/include/string b/include/string
index a35c7b1..a60ffec 100644
--- a/include/string
+++ b/include/string
@@ -1549,6 +1549,16 @@
     inline
     void __init(size_type __n, value_type __c);
 
+    // Slow path for the (inlined) copy constructor for 'long' strings.
+    // Always externally instantiated and not inlined.
+    // Requires that __s is zero terminated.
+    // The main reason for this function to exist is because for unstable, we
+    // want to allow inlining of the copy constructor. However, we don't want
+    // to call the __init() functions as those are marked as inline which may
+    // result in over-aggressive inlining by the compiler, where our aim is
+    // to only inline the fast path code directly in the ctor.
+    void __init_copy_ctor_external(const value_type* __s, size_type __sz);
+
     template <class _InputIterator>
     inline
     _EnableIf
@@ -1861,7 +1871,9 @@
     if (!__str.__is_long())
         __r_.first().__r = __str.__r_.first().__r;
     else
-        __init(_VSTD::__to_address(__str.__get_long_pointer()), __str.__get_long_size());
+        __init_copy_ctor_external(_VSTD::__to_address(__str.__get_long_pointer()),
+                                  __str.__get_long_size());
+
 #if _LIBCPP_DEBUG_LEVEL >= 2
     __get_db()->__insert_c(this);
 #endif
@@ -1875,12 +1887,32 @@
     if (!__str.__is_long())
         __r_.first().__r = __str.__r_.first().__r;
     else
-        __init(_VSTD::__to_address(__str.__get_long_pointer()), __str.__get_long_size());
+        __init_copy_ctor_external(_VSTD::__to_address(__str.__get_long_pointer()),
+                                  __str.__get_long_size());
 #if _LIBCPP_DEBUG_LEVEL >= 2
     __get_db()->__insert_c(this);
 #endif
 }
 
+template <class _CharT, class _Traits, class _Allocator>
+void basic_string<_CharT, _Traits, _Allocator>::__init_copy_ctor_external(
+    const value_type* __s, size_type __sz) {
+  pointer __p;  // destination buffer; assigned in whichever branch is taken below
+  if (__sz < __min_cap) {  // small enough: use the short pointer and short size
+    __p = __get_short_pointer();
+    __set_short_size(__sz);
+  } else {  // otherwise allocate and record long pointer / capacity / size
+    if (__sz > max_size())  // reject oversized requests before allocating
+      this->__throw_length_error();
+    size_t __cap = __recommend(__sz);
+    __p = __alloc_traits::allocate(__alloc(), __cap + 1);  // __cap + 1 elements; the extra one is presumably for the terminator
+    __set_long_pointer(__p);
+    __set_long_cap(__cap + 1);  // stored capacity is the allocated element count
+    __set_long_size(__sz);
+  }
+  traits_type::copy(_VSTD::__to_address(__p), __s, __sz + 1);  // __sz + 1 also copies __s[__sz], so __s must be zero terminated
+}
+
 #ifndef _LIBCPP_CXX03_LANG
 
 template <class _CharT, class _Traits, class _Allocator>