//---------------------------------------------------------------------------// // Copyright (c) 2015 Jakub Szuppe // // Distributed under the Boost Software License, Version 1.0 // See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt // // See http://boostorg.github.com/compute for more information. //---------------------------------------------------------------------------// #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP #define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP #include #include #include #include #include #include #include namespace boost { namespace compute { namespace detail { template inline void merge_blocks(KeyIterator keys_first, ValueIterator values_first, KeyIterator keys_result, ValueIterator values_result, Compare compare, size_t count, const size_t block_size, const bool sort_by_key, command_queue &queue) { (void) values_result; (void) values_first; meta_kernel k("merge_sort_on_cpu_merge_blocks"); size_t count_arg = k.add_arg("count"); size_t block_size_arg = k.add_arg("block_size"); k << k.decl("b1_start") << " = get_global_id(0) * block_size * 2;\n" << k.decl("b1_end") << " = min(count, b1_start + block_size);\n" << k.decl("b2_start") << " = min(count, b1_start + block_size);\n" << k.decl("b2_end") << " = min(count, b2_start + block_size);\n" << k.decl("result_idx") << " = b1_start;\n" << // merging block 1 and block 2 (stable) "while(b1_start < b1_end && b2_start < b2_end){\n" << " if( " << compare(keys_first[k.var("b2_start")], keys_first[k.var("b1_start")]) << "){\n" << " " << keys_result[k.var("result_idx")] << " = " << keys_first[k.var("b2_start")] << ";\n"; if(sort_by_key){ k << " " << values_result[k.var("result_idx")] << " = " << values_first[k.var("b2_start")] << ";\n"; } k << " b2_start++;\n" << " }\n" << " else {\n" << " " << keys_result[k.var("result_idx")] << " = " << keys_first[k.var("b1_start")] << ";\n"; if(sort_by_key){ k << " " << values_result[k.var("result_idx")] << " = " << values_first[k.var("b1_start")] << ";\n"; } k << " b1_start++;\n" << " }\n" << " result_idx++;\n" << "}\n" << "while(b1_start < b1_end){\n" << " " << keys_result[k.var("result_idx")] << " = " << keys_first[k.var("b1_start")] << ";\n"; if(sort_by_key){ k << " " << values_result[k.var("result_idx")] << " = " << values_first[k.var("b1_start")] << ";\n"; } k << " b1_start++;\n" << " result_idx++;\n" << "}\n" << "while(b2_start < b2_end){\n" << " " << keys_result[k.var("result_idx")] << " = " << keys_first[k.var("b2_start")] << ";\n"; if(sort_by_key){ k << " " << values_result[k.var("result_idx")] << " = " << values_first[k.var("b2_start")] << ";\n"; } k << " b2_start++;\n" << " result_idx++;\n" << "}\n"; const context &context = queue.get_context(); ::boost::compute::kernel kernel = k.compile(context); kernel.set_arg(count_arg, static_cast(count)); kernel.set_arg(block_size_arg, static_cast(block_size)); const size_t global_size = static_cast( std::ceil(float(count) / (2 * block_size)) ); queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0); } template inline void merge_blocks(Iterator first, Iterator result, Compare compare, size_t count, const size_t block_size, const bool sort_by_key, command_queue &queue) { // dummy iterator as it's not sort by key Iterator dummy; merge_blocks(first, dummy, result, dummy, compare, count, block_size, false, queue); } template inline void dispatch_merge_blocks(Iterator first, Iterator result, Compare compare, size_t count, const size_t block_size, const size_t input_size_threshold, const size_t blocks_no_threshold, command_queue &queue) { const size_t blocks_no = static_cast( std::ceil(float(count) / block_size) ); // merge with merge path should used only for the large arrays and at the // end of merging part when there are only a few big blocks left to be merged if(blocks_no <= blocks_no_threshold && count >= input_size_threshold){ Iterator last = first + count; for(size_t i = 0; i < count; i+= 2*block_size) { Iterator first1 = (std::min)(first + i, last); Iterator last1 = (std::min)(first1 + block_size, last); Iterator first2 = last1; Iterator last2 = (std::min)(first2 + block_size, last); Iterator block_result = (std::min)(result + i, result + count); merge_with_merge_path(first1, last1, first2, last2, block_result, compare, queue); } } else { merge_blocks(first, result, compare, count, block_size, false, queue); } } template inline void block_insertion_sort(KeyIterator keys_first, ValueIterator values_first, Compare compare, const size_t count, const size_t block_size, const bool sort_by_key, command_queue &queue) { (void) values_first; typedef typename std::iterator_traits::value_type K; typedef typename std::iterator_traits::value_type T; meta_kernel k("merge_sort_on_cpu_block_insertion_sort"); size_t count_arg = k.add_arg("count"); size_t block_size_arg = k.add_arg("block_size"); k << k.decl("start") << " = get_global_id(0) * block_size;\n" << k.decl("end") << " = min(count, start + block_size);\n" << // block insertion sort (stable) "for(uint i = start+1; i < end; i++){\n" << " " << k.decl("key") << " = " << keys_first[k.var("i")] << ";\n"; if(sort_by_key){ k << " " << k.decl("value") << " = " << values_first[k.var("i")] << ";\n"; } k << " uint pos = i;\n" << " while(pos > start && " << compare(k.var("key"), keys_first[k.var("pos-1")]) << "){\n" << " " << keys_first[k.var("pos")] << " = " << keys_first[k.var("pos-1")] << ";\n"; if(sort_by_key){ k << " " << values_first[k.var("pos")] << " = " << values_first[k.var("pos-1")] << ";\n"; } k << " pos--;\n" << " }\n" << " " << keys_first[k.var("pos")] << " = key;\n"; if(sort_by_key) { k << " " << values_first[k.var("pos")] << " = value;\n"; } k << "}\n"; // block insertion sort const context &context = queue.get_context(); ::boost::compute::kernel kernel = k.compile(context); kernel.set_arg(count_arg, static_cast(count)); kernel.set_arg(block_size_arg, static_cast(block_size)); const size_t global_size = static_cast(std::ceil(float(count) / block_size)); queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0); } template inline void block_insertion_sort(Iterator first, Compare compare, const size_t count, const size_t block_size, command_queue &queue) { // dummy iterator as it's not sort by key Iterator dummy; block_insertion_sort(first, dummy, compare, count, block_size, false, queue); } // This sort is stable. template inline void merge_sort_on_cpu(Iterator first, Iterator last, Compare compare, command_queue &queue) { typedef typename std::iterator_traits::value_type value_type; size_t count = iterator_range_size(first, last); if(count < 2){ return; } // for small input size only insertion sort is performed else if(count <= 512){ block_insertion_sort(first, compare, count, count, queue); return; } const context &context = queue.get_context(); const device &device = queue.get_device(); // loading parameters std::string cache_key = std::string("__boost_merge_sort_on_cpu_") + type_name(); boost::shared_ptr parameters = detail::parameter_cache::get_global_cache(device); // When there is merge_with_path_blocks_no_threshold or less blocks left to // merge AND input size is merge_with_merge_path_input_size_threshold or more // merge_with_merge_path() algorithm is used to merge sorted blocks; // otherwise merge_blocks() is used. const size_t merge_with_path_blocks_no_threshold = parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8); const size_t merge_with_path_input_size_threshold = parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152); const size_t block_size = parameters->get(cache_key, "insertion_sort_block_size", 64); block_insertion_sort(first, compare, count, block_size, queue); // temporary buffer for merge result vector temp(count, context); bool result_in_temporary_buffer = false; for(size_t i = block_size; i < count; i *= 2){ result_in_temporary_buffer = !result_in_temporary_buffer; if(result_in_temporary_buffer) { dispatch_merge_blocks(first, temp.begin(), compare, count, i, merge_with_path_input_size_threshold, merge_with_path_blocks_no_threshold, queue); } else { dispatch_merge_blocks(temp.begin(), first, compare, count, i, merge_with_path_input_size_threshold, merge_with_path_blocks_no_threshold, queue); } } if(result_in_temporary_buffer) { copy(temp.begin(), temp.end(), first, queue); } } // This sort is stable. template inline void merge_sort_by_key_on_cpu(KeyIterator keys_first, KeyIterator keys_last, ValueIterator values_first, Compare compare, command_queue &queue) { typedef typename std::iterator_traits::value_type key_type; typedef typename std::iterator_traits::value_type value_type; size_t count = iterator_range_size(keys_first, keys_last); if(count < 2){ return; } // for small input size only insertion sort is performed else if(count <= 512){ block_insertion_sort(keys_first, values_first, compare, count, count, true, queue); return; } const context &context = queue.get_context(); const device &device = queue.get_device(); // loading parameters std::string cache_key = std::string("__boost_merge_sort_by_key_on_cpu_") + type_name() + "_with_" + type_name(); boost::shared_ptr parameters = detail::parameter_cache::get_global_cache(device); const size_t block_size = parameters->get(cache_key, "insertion_sort_by_key_block_size", 64); block_insertion_sort(keys_first, values_first, compare, count, block_size, true, queue); // temporary buffer for merge results vector values_temp(count, context); vector keys_temp(count, context); bool result_in_temporary_buffer = false; for(size_t i = block_size; i < count; i *= 2){ result_in_temporary_buffer = !result_in_temporary_buffer; if(result_in_temporary_buffer) { merge_blocks(keys_first, values_first, keys_temp.begin(), values_temp.begin(), compare, count, i, true, queue); } else { merge_blocks(keys_temp.begin(), values_temp.begin(), keys_first, values_first, compare, count, i, true, queue); } } if(result_in_temporary_buffer) { copy(keys_temp.begin(), keys_temp.end(), keys_first, queue); copy(values_temp.begin(), values_temp.end(), values_first, queue); } } } // end detail namespace } // end compute namespace } // end boost namespace #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP