// multiway_mergesort.h - OpenGrok cross reference for /netbsd/src/external/gpl3/gcc/dist/libstdc++-v3/include/parallel/multiway_mergesort.h

// -*- C++ -*-

// Copyright (C) 2007-2022 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the terms
// of the GNU General Public License as published by the Free Software
// Foundation; either version 3, or (at your option) any later
// version.

// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

/** @file parallel/multiway_mergesort.h
 *  @brief Parallel multiway merge sort.
 *  This file is a GNU parallel extension to the Standard C++ Library.
 */

// Written by Johannes Singler.

#ifndef _GLIBCXX_PARALLEL_MULTIWAY_MERGESORT_H
#define _GLIBCXX_PARALLEL_MULTIWAY_MERGESORT_H 1

#include <vector>

#include <parallel/basic_iterator.h>
#include <bits/stl_algo.h>
#include <parallel/parallel.h>
#include <parallel/multiway_merge.h>

namespace __gnu_parallel
{
  /**
   *  @brief Index range describing one subsequence.
   *
   *  Holds a begin and an end offset of type @c _DifferenceTp.  In this
   *  file the offsets are added to the start of a thread's temporary
   *  buffer, and the length of the piece is @c _M_end - @c _M_begin.
   */
  template<typename _DifferenceTp>
    struct _Piece
    {
      /** @brief Type used for both offsets. */
      typedef _DifferenceTp _DifferenceType;

      /** @brief Offset of the first element of the subsequence. */
      _DifferenceTp _M_begin;

      /** @brief Offset just past the last element of the subsequence. */
      _DifferenceTp _M_end;
    };

  /**
   *  @brief State shared between all threads of one sorting run.
   *
   *  PMWMS = parallel multiway mergesort.  The struct only stores raw
   *  pointers; it has no constructor or destructor, so whoever fills it
   *  in is responsible for allocating and releasing the arrays.
   */
  template<typename _RAIter>
    struct _PMWMSSortingData
    {
      typedef std::iterator_traits<_RAIter> _TraitsType;
      typedef typename _TraitsType::value_type _ValueType;
      typedef typename _TraitsType::difference_type _DifferenceType;

      /** @brief Size of the thread team working on the sort. */
      _ThreadIndex _M_num_threads;

      /** @brief Iterator to the first element of the input sequence. */
      _RAIter _M_source;

      /** @brief Per-thread start indices into the input; thread @c t
       *  works on indices @c _M_starts[t] up to @c _M_starts[t + 1]. */
      _DifferenceType* _M_starts;

      /** @brief One temporary buffer pointer per thread, in which that
       *  thread sorts its chunk. */
      _ValueType** _M_temporary;

      /** @brief Sample elements (used by the sampling split only). */
      _ValueType* _M_samples;

      /** @brief Offsets to add to the found positions. */
      _DifferenceType* _M_offsets;

      /** @brief Pieces of data to merge, indexed
       *  @c [thread][sequence]. */
      std::vector<_Piece<_DifferenceType> >* _M_pieces;
    };

  /**
   *  @brief Copy sample elements from the calling thread's chunk.
   *
   *  The calling thread (identified through @c omp_get_thread_num())
   *  copy-constructs @c __num_samples elements of its part of the input
   *  into its own slice of @c __sd->_M_samples, i.e. the slots
   *  @c [iam * __num_samples, (iam + 1) * __num_samples).  The slots are
   *  raw storage, hence the placement new.
   *
   *  @param __sd Pointer to algorithm data. The result is placed in
   *  @c __sd->_M_samples.
   *  @param __num_samples Number of samples to select.
   */
  template<typename _RAIter, typename _DifferenceTp>
    void
    __determine_samples(_PMWMSSortingData<_RAIter>* __sd,
                        _DifferenceTp __num_samples)
    {
      typedef std::iterator_traits<_RAIter> _TraitsType;
      typedef typename _TraitsType::value_type _ValueType;
      typedef _DifferenceTp _DifferenceType;

      const _ThreadIndex __iam = omp_get_thread_num();

      // Border positions inside the local chunk, produced by
      // __equally_split for __num_samples + 1 parts.
      // NOTE(review): presumably __equally_split writes
      // __num_samples + 2 borders, matching the array size -- confirm
      // against its definition.
      _DifferenceType* __borders = new _DifferenceType[__num_samples + 2];

      __equally_split(__sd->_M_starts[__iam + 1] - __sd->_M_starts[__iam],
                      __num_samples + 1, __borders);

      // Sample k is the element at border k + 1; border 0 is not used.
      for (_DifferenceType __k = 0; __k < __num_samples; ++__k)
        {
          _ValueType* __slot =
            &(__sd->_M_samples[__iam * __num_samples + __k]);
          ::new(__slot)
            _ValueType(__sd->_M_source[__sd->_M_starts[__iam]
                                       + __borders[__k + 1]]);
        }

      delete[] __borders;
    }

  /**
   *  @brief Split consistently.
   *
   *  The primary template is deliberately empty; the splitting code
   *  lives in the partial specializations for @c __exact == true and
   *  @c __exact == false.
   */
  template<bool __exact, typename _RAIter,
           typename _Compare, typename _SortingPlacesIterator>
    struct _SplitConsistently
    {
    };

  /** @brief Split by exact splitting. */
  template<typename _RAIter, typename _Compare,
             typename _SortingPlacesIterator>
    struct _SplitConsistently<true, _RAIter, _Compare, _SortingPlacesIterator>
    {
      void
      operator()(const _ThreadIndex __iam,
                     _PMWMSSortingData<_RAIter>* __sd,
                     _Compare& __comp,
                     const typename
                     std::iterator_traits<_RAIter>::difference_type
                     __num_samples) const
      {
#       pragma omp barrier

          std::vector<std::pair<_SortingPlacesIterator,
                                _SortingPlacesIterator> >
            __seqs(__sd->_M_num_threads);
          for (_ThreadIndex __s = 0; __s < __sd->_M_num_threads; __s++)
            __seqs[__s] = std::make_pair(__sd->_M_temporary[__s],
                                               __sd->_M_temporary[__s]
                                               + (__sd->_M_starts[__s + 1]
                                                    - __sd->_M_starts[__s]));

          std::vector<_SortingPlacesIterator> __offsets(__sd->_M_num_threads);

          // if not last thread
          if (__iam < __sd->_M_num_threads - 1)
            multiseq_partition(__seqs.begin(), __seqs.end(),
                                   __sd->_M_starts[__iam + 1], __offsets.begin(),
                                   __comp);

          for (_ThreadIndex __seq = 0; __seq < __sd->_M_num_threads; __seq++)
            {
              // for each sequence
              if (__iam < (__sd->_M_num_threads - 1))
                __sd->_M_pieces[__iam][__seq]._M_end
                    = __offsets[__seq] - __seqs[__seq].first;
              else
                // very end of this sequence
                __sd->_M_pieces[__iam][__seq]._M_end =
                    __sd->_M_starts[__seq + 1] - __sd->_M_starts[__seq];
            }

#       pragma omp barrier

          for (_ThreadIndex __seq = 0; __seq < __sd->_M_num_threads; __seq++)
            {
              // For each sequence.
              if (__iam > 0)
                __sd->_M_pieces[__iam][__seq]._M_begin =
                    __sd->_M_pieces[__iam - 1][__seq]._M_end;
              else
                // Absolute beginning.
                __sd->_M_pieces[__iam][__seq]._M_begin = 0;
            }
      }
  };

  /** @brief Split by sampling. */
  template<typename _RAIter, typename _Compare,
             typename _SortingPlacesIterator>
    struct _SplitConsistently<false, _RAIter, _Compare, _SortingPlacesIterator>
    {
      void
      operator()(const _ThreadIndex __iam,
                     _PMWMSSortingData<_RAIter>* __sd,
                     _Compare& __comp,
                     const typename
                     std::iterator_traits<_RAIter>::difference_type
                     __num_samples) const
      {
          typedef std::iterator_traits<_RAIter> _TraitsType;
          typedef typename _TraitsType::value_type _ValueType;
          typedef typename _TraitsType::difference_type _DifferenceType;

          __determine_samples(__sd, __num_samples);

#       pragma omp barrier

#       pragma omp single
          __gnu_sequential::sort(__sd->_M_samples,
                                     __sd->_M_samples
                                     + (__num_samples * __sd->_M_num_threads),
                                     __comp);

#       pragma omp barrier

          for (_ThreadIndex __s = 0; __s < __sd->_M_num_threads; ++__s)
            {
              // For each sequence.
              if (__num_samples * __iam > 0)
                __sd->_M_pieces[__iam][__s]._M_begin =
                std::lower_bound(__sd->_M_temporary[__s],
                                         __sd->_M_temporary[__s]
                                         + (__sd->_M_starts[__s + 1]
                                            - __sd->_M_starts[__s]),
                                         __sd->_M_samples[__num_samples * __iam],
                                         __comp)
                - __sd->_M_temporary[__s];
              else
                // Absolute beginning.
                __sd->_M_pieces[__iam][__s]._M_begin = 0;

              if ((__num_samples * (__iam + 1)) <
                    (__num_samples * __sd->_M_num_threads))
                __sd->_M_pieces[__iam][__s]._M_end =
                std::lower_bound(__sd->_M_temporary[__s],
                                         __sd->_M_temporary[__s]
                                         + (__sd->_M_starts[__s + 1]
                                            - __sd->_M_starts[__s]),
                                         __sd->_M_samples[__num_samples * (__iam + 1)],
                                         __comp)
                - __sd->_M_temporary[__s];
              else
                // Absolute end.
                __sd->_M_pieces[__iam][__s]._M_end = (__sd->_M_starts[__s + 1]
                                                                - __sd->_M_starts[__s]);
            }
      }
  };

  /**
   *  @brief Compile-time switch between a stable and a non-stable
   *  sequential sort.
   *
   *  The primary template is empty; the call operator is supplied by
   *  the specializations for @c __stable == true and
   *  @c __stable == false.
   */
  template<bool __stable, typename _RAIter, typename _Compare>
    struct __possibly_stable_sort
    {
    };

  /**
   *  @brief Stable variant: sorts @c [__begin, __end) by forwarding to
   *  @c __gnu_sequential::stable_sort.
   */
  template<typename _RAIter, typename _Compare>
    struct __possibly_stable_sort<true, _RAIter, _Compare>
    {
      /**
       *  @param __begin Begin iterator of the range to sort.
       *  @param __end End iterator of the range to sort.
       *  @param __comp Comparator.
       */
      void
      operator()(const _RAIter& __begin,
                 const _RAIter& __end, _Compare& __comp) const
      {
        __gnu_sequential::stable_sort(__begin, __end, __comp);
      }
    };

  /**
   *  @brief Non-stable variant: sorts @c [__begin, __end) by forwarding
   *  to @c __gnu_sequential::sort.
   */
  template<typename _RAIter, typename _Compare>
    struct __possibly_stable_sort<false, _RAIter, _Compare>
    {
      /**
       *  @param __begin Begin iterator of the range to sort.
       *  @param __end End iterator of the range to sort.
       *  @param __comp Comparator.
       */
      void
      operator()(const _RAIter __begin,
                 const _RAIter __end, _Compare& __comp) const
      {
        __gnu_sequential::sort(__begin, __end, __comp);
      }
    };

  /**
   *  @brief Compile-time switch between a stable and a non-stable
   *  sequential multiway merge.
   *
   *  The primary template is empty; the call operator is supplied by
   *  the specializations for @c __stable == true and
   *  @c __stable == false.
   */
  template<bool __stable, typename _Seq_RAIter,
           typename _RAIter, typename _Compare,
           typename _DiffType>
    struct __possibly_stable_multiway_merge
    {
    };

  /**
   *  @brief Stable variant: forwards to @c stable_multiway_merge with
   *  @c sequential_tag().
   */
  template<typename _Seq_RAIter, typename _RAIter,
           typename _Compare, typename _DiffType>
    struct __possibly_stable_multiway_merge<true, _Seq_RAIter,
                                            _RAIter, _Compare, _DiffType>
    {
      /**
       *  @param __seqs_begin Begin of the collection of input sequences.
       *  @param __seqs_end End of the collection of input sequences.
       *  @param __target Where the merged output is written.
       *  @param __comp Comparator.
       *  @param __length_am Length passed on to the merge routine.
       *
       *  Note that the callee takes the length before the comparator.
       */
      void
      operator()(const _Seq_RAIter& __seqs_begin,
                 const _Seq_RAIter& __seqs_end,
                 const _RAIter& __target,
                 _Compare& __comp,
                 _DiffType __length_am) const
      {
        stable_multiway_merge(__seqs_begin, __seqs_end, __target,
                              __length_am, __comp, sequential_tag());
      }
    };

  /**
   *  @brief Non-stable variant: forwards to @c multiway_merge with
   *  @c sequential_tag().
   */
  template<typename _Seq_RAIter, typename _RAIter,
           typename _Compare, typename _DiffType>
    struct __possibly_stable_multiway_merge<false, _Seq_RAIter,
                                            _RAIter, _Compare, _DiffType>
    {
      /**
       *  @param __seqs_begin Begin of the collection of input sequences.
       *  @param __seqs_end End of the collection of input sequences.
       *  @param __target Where the merged output is written.
       *  @param __comp Comparator.
       *  @param __length_am Length passed on to the merge routine.
       *
       *  Note that the callee takes the length before the comparator.
       */
      void
      operator()(const _Seq_RAIter& __seqs_begin,
                 const _Seq_RAIter& __seqs_end,
                 const _RAIter& __target,
                 _Compare& __comp,
                 _DiffType __length_am) const
      {
        multiway_merge(__seqs_begin, __seqs_end, __target, __length_am,
                       __comp, sequential_tag());
      }
    };

  /** @brief PMWMS code executed by each thread.
   *
   *  Steps performed by the calling thread:
   *  -# copy its chunk of the input into freshly allocated raw storage
   *     and sort it there,
   *  -# run the splitting routine, which fills
   *     @c __sd->_M_pieces[iam][*],
   *  -# merge its pieces of all threads' buffers back into the input
   *     range,
   *  -# after a barrier, destroy and release its temporary buffer.
   *
   *  Has to be called by every thread of the team, because it contains
   *  a barrier and the splitting routine synchronizes as well.
   *
   *  @param __sd Pointer to algorithm data.
   *  @param __comp Comparator.
   */
  template<bool __stable, bool __exact, typename _RAIter,
           typename _Compare>
    void
    parallel_sort_mwms_pu(_PMWMSSortingData<_RAIter>* __sd,
                          _Compare& __comp)
    {
      typedef std::iterator_traits<_RAIter> _TraitsType;
      typedef typename _TraitsType::value_type _ValueType;
      typedef typename _TraitsType::difference_type _DifferenceType;
      typedef _ValueType* _SortingPlacesIterator;
      typedef std::vector<
        std::pair<_SortingPlacesIterator, _SortingPlacesIterator> >
        _SeqVector;

      const _ThreadIndex __iam = omp_get_thread_num();

      // Position and length of this thread's chunk, before merging.
      const _DifferenceType __chunk_start = __sd->_M_starts[__iam];
      const _DifferenceType __length_local =
        __sd->_M_starts[__iam + 1] - __chunk_start;

      // Raw storage to sort in, with room for one extra element
      // (sentinel).  Published through __sd so other threads can read
      // it while merging.
      _ValueType* const __scratch =
        static_cast<_ValueType*>(::operator new(sizeof(_ValueType)
                                                * (__length_local + 1)));
      __sd->_M_temporary[__iam] = __scratch;

      // Copy-construct the chunk into the raw storage.
      const _RAIter __chunk_first = __sd->_M_source + __chunk_start;
      std::uninitialized_copy(__chunk_first,
                              __chunk_first + __length_local,
                              __scratch);

      __possibly_stable_sort<__stable, _SortingPlacesIterator, _Compare>()
        (__scratch, __scratch + __length_local, __comp);

      // Invariant: [__scratch, __scratch + __length_local) is sorted.

      // No barrier here: Synchronization is done by the splitting routine.

      _DifferenceType __num_samples =
        _Settings::get().sort_mwms_oversampling * __sd->_M_num_threads - 1;
      _SplitConsistently<__exact, _RAIter, _Compare, _SortingPlacesIterator>()
        (__iam, __sd, __comp, __num_samples);

      // __offset: where this thread's output starts in the target
      // (sum of the begin positions of its pieces).
      // __length_am: number of elements this thread merges.
      _DifferenceType __offset = 0, __length_am = 0;
      _SeqVector __seqs(__sd->_M_num_threads);

      for (_ThreadIndex __s = 0; __s < __sd->_M_num_threads; ++__s)
        {
          const _Piece<_DifferenceType>& __piece =
            __sd->_M_pieces[__iam][__s];

          __length_am += (__piece._M_end - __piece._M_begin);
          __offset += __piece._M_begin;

          __seqs[__s] =
            std::make_pair(__sd->_M_temporary[__s] + __piece._M_begin,
                           __sd->_M_temporary[__s] + __piece._M_end);
        }

      __possibly_stable_multiway_merge<
        __stable, typename _SeqVector::iterator,
        _RAIter, _Compare, _DifferenceType>()(__seqs.begin(), __seqs.end(),
                                              __sd->_M_source + __offset,
                                              __comp, __length_am);

      // Other threads read from this thread's buffer while merging, so
      // it may only be torn down once everybody is done.
#     pragma omp barrier

      for (_DifferenceType __i = 0; __i < __length_local; ++__i)
        __scratch[__i].~_ValueType();
      ::operator delete(__scratch);
    }

  /** @brief PMWMS main call.
   *
   *  Sorts @c [__begin, __end) in place.  Sequences with at most one
   *  element are returned untouched.  Otherwise a team of threads is
   *  started, one of them sets up the shared bookkeeping, every thread
   *  runs @c parallel_sort_mwms_pu, and the bookkeeping is released
   *  after the parallel region.
   *
   *  @param __begin Begin iterator of sequence.
   *  @param __end End iterator of sequence.
   *  @param __comp Comparator.
   *  @param __num_threads Number of threads to use.  Reduced to the
   *  number of elements if larger; the size of the team actually
   *  obtained is what the algorithm works with.
   */
  template<bool __stable, bool __exact, typename _RAIter,
           typename _Compare>
    void
    parallel_sort_mwms(_RAIter __begin, _RAIter __end,
                       _Compare __comp,
                       _ThreadIndex __num_threads)
    {
      _GLIBCXX_CALL(__end - __begin)

      typedef std::iterator_traits<_RAIter> _TraitsType;
      typedef typename _TraitsType::value_type _ValueType;
      typedef typename _TraitsType::difference_type _DifferenceType;

      _DifferenceType __n = __end - __begin;

      if (__n <= 1)
        return;

      // at least one element per thread
      if (__num_threads > __n)
        __num_threads = static_cast<_ThreadIndex>(__n);

      // shared variables
      _PMWMSSortingData<_RAIter> __sd;
      _DifferenceType* __starts;
      // Number of sample slots; only set and used when !__exact.
      // Initialized so it never holds an indeterminate value.
      _DifferenceType __size = 0;

#     pragma omp parallel num_threads(__num_threads)
      {
#       pragma omp single
          {
            // No more threads than requested.  __num_threads is shared
            // by the team, so it is updated here, by one thread only,
            // instead of being written by all threads concurrently.
            // The rest of the set-up is the only consumer of the value.
            __num_threads = omp_get_num_threads();

            __sd._M_num_threads = __num_threads;
            __sd._M_source = __begin;

            __sd._M_temporary = new _ValueType*[__num_threads];

            if (!__exact)
              {
                // Raw storage for the samples; the elements are
                // constructed later by the threads themselves.
                __size =
                    (_Settings::get().sort_mwms_oversampling * __num_threads - 1)
                    * __num_threads;
                __sd._M_samples = static_cast<_ValueType*>
                    (::operator new(__size * sizeof(_ValueType)));
              }
            else
              __sd._M_samples = 0;

            __sd._M_offsets = new _DifferenceType[__num_threads - 1];
            __sd._M_pieces
              = new std::vector<_Piece<_DifferenceType> >[__num_threads];
            for (_ThreadIndex __s = 0; __s < __num_threads; ++__s)
              __sd._M_pieces[__s].resize(__num_threads);
            __starts = __sd._M_starts = new _DifferenceType[__num_threads + 1];

            // Distribute the input: the first (__n % __num_threads)
            // threads get one element more than the others.
            _DifferenceType __chunk_length = __n / __num_threads;
            _DifferenceType __split = __n % __num_threads;
            _DifferenceType __pos = 0;
            for (_ThreadIndex __i = 0; __i < __num_threads; ++__i)
              {
                __starts[__i] = __pos;
                __pos += ((__i < __split)
                              ? (__chunk_length + 1) : __chunk_length);
              }
            __starts[__num_threads] = __pos;
          } //single (all threads wait here until the set-up is done)

        // Now sort in parallel.
        parallel_sort_mwms_pu<__stable, __exact>(&__sd, __comp);
      } //parallel

      delete[] __starts;
      delete[] __sd._M_temporary;

      if (!__exact)
          {
            // The samples were placement-constructed, so destroy them
            // by hand before releasing the raw storage.
            for (_DifferenceType __i = 0; __i < __size; ++__i)
              __sd._M_samples[__i].~_ValueType();
            ::operator delete(__sd._M_samples);
          }

      delete[] __sd._M_offsets;
      delete[] __sd._M_pieces;
    }

} //namespace __gnu_parallel

#endif /* _GLIBCXX_PARALLEL_MULTIWAY_MERGESORT_H */