[libc++] Use bit field for checking if string is in long or short mode

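Replace the `__short_mask`/`__long_mask` bit masks with an `__is_long_` bit-field in both the long and short string representations. This makes the code a bit simpler and (I think) removes the undefined behaviour from the normal (non-alternate) string layout.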

Reviewed By: ldionne, Mordante, #libc

Spies: labath, dblaikie, JDevlieghere, krytarowski, jgorbe, jingham, saugustine, arichardson, libcxx-commits

Differential Revision: https://reviews.llvm.org/D123580

NOKEYCHECK=True
GitOrigin-RevId: 29c8c070a1770fc510ccad3be753f6f50336f8cc
diff --git a/include/string b/include/string
index 9fd19d2..bd1c430 100644
--- a/include/string
+++ b/include/string
@@ -532,6 +532,8 @@
 #include <__utility/auto_cast.h>
 #include <__utility/move.h>
 #include <__utility/swap.h>
+#include <__utility/unreachable.h>
+#include <climits>
 #include <compare>
 #include <cstdio>  // EOF
 #include <cstdlib>
@@ -539,6 +541,7 @@
 #include <initializer_list>
 #include <iosfwd>
 #include <iterator>
+#include <limits>
 #include <memory>
 #include <stdexcept>
 #include <string_view>
@@ -674,17 +677,10 @@
     {
         pointer   __data_;
         size_type __size_;
-        size_type __cap_;
+        size_type __cap_ : sizeof(size_type) * CHAR_BIT - 1;
+        size_type __is_long_ : 1;
     };
 
-#ifdef _LIBCPP_BIG_ENDIAN
-    static const size_type __short_mask = 0x01;
-    static const size_type __long_mask  = 0x1ul;
-#else  // _LIBCPP_BIG_ENDIAN
-    static const size_type __short_mask = 0x80;
-    static const size_type __long_mask  = ~(size_type(~0) >> 1);
-#endif // _LIBCPP_BIG_ENDIAN
-
     enum {__min_cap = (sizeof(__long) - 1)/sizeof(value_type) > 2 ?
                       (sizeof(__long) - 1)/sizeof(value_type) : 2};
 
@@ -692,26 +688,46 @@
     {
         value_type __data_[__min_cap];
         unsigned char __padding[sizeof(value_type) - 1];
-        unsigned char __size_;
+        unsigned char __size_ : 7;
+        unsigned char __is_long_ : 1;
     };
 
+// The __endian_factor is required because the field we use to store the size
+// (either size_type or unsigned char depending on long/short) has one fewer
+// bit than it would if it were not a bitfield.
+//
+// If the LSB is used to store the short-flag in the short string representation,
+// we have to multiply the size by two when it is stored and divide it by two when
+// it is loaded to make sure that we always store an even number. In the long string
+// representation, we can ignore this because we can assume that we always allocate
+// an even number of value_types.
+//
+// If the MSB is used for the short-flag, the max_size() is at most numeric_limits<size_type>::max() / 2.
+// This does not impact the short string representation, since we never need the MSB
+// for representing the size of a short string anyway.
+
+#ifdef _LIBCPP_BIG_ENDIAN
+    static const size_type __endian_factor = 2;
 #else
+    static const size_type __endian_factor = 1;
+#endif
+
+#else // _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
+
+#ifdef _LIBCPP_BIG_ENDIAN
+    static const size_type __endian_factor = 1;
+#else
+    static const size_type __endian_factor = 2;
+#endif
 
     struct __long
     {
-        size_type __cap_;
+        size_type __is_long_ : 1;
+        size_type __cap_ : sizeof(size_type) * CHAR_BIT - 1;
         size_type __size_;
         pointer   __data_;
     };
 
-#ifdef _LIBCPP_BIG_ENDIAN
-    static const size_type __short_mask = 0x80;
-    static const size_type __long_mask  = ~(size_type(~0) >> 1);
-#else  // _LIBCPP_BIG_ENDIAN
-    static const size_type __short_mask = 0x01;
-    static const size_type __long_mask  = 0x1ul;
-#endif // _LIBCPP_BIG_ENDIAN
-
     enum {__min_cap = (sizeof(__long) - 1)/sizeof(value_type) > 2 ?
                       (sizeof(__long) - 1)/sizeof(value_type) : 2};
 
@@ -719,7 +735,10 @@
     {
         union
         {
-            unsigned char __size_;
+            struct {
+                unsigned char __is_long_ : 1;
+                unsigned char __size_ : 7;
+            };
             value_type __lx;
         };
         value_type __data_[__min_cap];
@@ -1426,8 +1445,9 @@
     _LIBCPP_INLINE_VISIBILITY void __shrink_or_extend(size_type __target_capacity);
 
     _LIBCPP_INLINE_VISIBILITY
-    bool __is_long() const _NOEXCEPT
-        {return bool(__r_.first().__s.__size_ & __short_mask);}
+    bool __is_long() const _NOEXCEPT {
+        return __r_.first().__s.__is_long_;
+    }
 
 #if _LIBCPP_DEBUG_LEVEL == 2
 
@@ -1474,43 +1494,18 @@
     _LIBCPP_HIDE_FROM_ABI allocator_type& __alloc() _NOEXCEPT { return __r_.second(); }
     _LIBCPP_HIDE_FROM_ABI const allocator_type& __alloc() const _NOEXCEPT { return __r_.second(); }
 
-#ifdef _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
+    _LIBCPP_INLINE_VISIBILITY
+    void __set_short_size(size_type __s) _NOEXCEPT {
+        _LIBCPP_ASSERT(__s < __min_cap, "__s should never be greater than or equal to the short string capacity");
+        __r_.first().__s.__size_ = __s;
+        __r_.first().__s.__is_long_ = false;
+    }
 
     _LIBCPP_INLINE_VISIBILITY
-    void __set_short_size(size_type __s) _NOEXCEPT
-#   ifdef _LIBCPP_BIG_ENDIAN
-        {__r_.first().__s.__size_ = (unsigned char)(__s << 1);}
-#   else
-        {__r_.first().__s.__size_ = (unsigned char)(__s);}
-#   endif
-
-    _LIBCPP_INLINE_VISIBILITY
-    size_type __get_short_size() const _NOEXCEPT
-#   ifdef _LIBCPP_BIG_ENDIAN
-        {return __r_.first().__s.__size_ >> 1;}
-#   else
-        {return __r_.first().__s.__size_;}
-#   endif
-
-#else  // _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
-
-    _LIBCPP_INLINE_VISIBILITY
-    void __set_short_size(size_type __s) _NOEXCEPT
-#   ifdef _LIBCPP_BIG_ENDIAN
-        {__r_.first().__s.__size_ = (unsigned char)(__s);}
-#   else
-        {__r_.first().__s.__size_ = (unsigned char)(__s << 1);}
-#   endif
-
-    _LIBCPP_INLINE_VISIBILITY
-    size_type __get_short_size() const _NOEXCEPT
-#   ifdef _LIBCPP_BIG_ENDIAN
-        {return __r_.first().__s.__size_;}
-#   else
-        {return __r_.first().__s.__size_ >> 1;}
-#   endif
-
-#endif // _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
+    size_type __get_short_size() const _NOEXCEPT {
+        _LIBCPP_ASSERT(!__r_.first().__s.__is_long_, "String has to be short when trying to get the short size");
+        return __r_.first().__s.__size_;
+    }
 
     _LIBCPP_INLINE_VISIBILITY
     void __set_long_size(size_type __s) _NOEXCEPT
@@ -1523,11 +1518,15 @@
         {if (__is_long()) __set_long_size(__s); else __set_short_size(__s);}
 
     _LIBCPP_INLINE_VISIBILITY
-    void __set_long_cap(size_type __s) _NOEXCEPT
-        {__r_.first().__l.__cap_  = __long_mask | __s;}
+    void __set_long_cap(size_type __s) _NOEXCEPT {
+        __r_.first().__l.__cap_ = __s / __endian_factor;
+        __r_.first().__l.__is_long_ = true;
+    }
+
     _LIBCPP_INLINE_VISIBILITY
-    size_type __get_long_cap() const _NOEXCEPT
-        {return __r_.first().__l.__cap_ & size_type(~__long_mask);}
+    size_type __get_long_cap() const _NOEXCEPT {
+        return __r_.first().__l.__cap_ * __endian_factor;
+    }
 
     _LIBCPP_INLINE_VISIBILITY
     void __set_long_pointer(pointer __p) _NOEXCEPT
@@ -3225,11 +3224,12 @@
 basic_string<_CharT, _Traits, _Allocator>::max_size() const _NOEXCEPT
 {
     size_type __m = __alloc_traits::max_size(__alloc());
-#ifdef _LIBCPP_BIG_ENDIAN
-    return (__m <= ~__long_mask ? __m : __m/2) - __alignment;
-#else
-    return __m - __alignment;
-#endif
+    if (__m <= std::numeric_limits<size_type>::max() / 2) {
+        return __m - __alignment;
+    } else {
+        bool __uses_lsb = __endian_factor == 2;
+        return __uses_lsb ? __m - __alignment : (__m / 2) - __alignment;
+    }
 }
 
 template <class _CharT, class _Traits, class _Allocator>