//---------------------------------------------------------------------------// // Copyright (c) 2016 Jakub Szuppe // // Distributed under the Boost Software License, Version 1.0 // See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt // // See http://boostorg.github.com/compute for more information. //---------------------------------------------------------------------------// #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP #define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP #include #include #include #include #include #include #include #include #include namespace boost { namespace compute { namespace detail { template inline void reduce_on_cpu(InputIterator first, InputIterator last, OutputIterator result, BinaryFunction function, command_queue &queue) { typedef typename std::iterator_traits::value_type T; typedef typename ::boost::compute::result_of::type result_type; const device &device = queue.get_device(); const uint_ compute_units = queue.get_device().compute_units(); boost::shared_ptr parameters = detail::parameter_cache::get_global_cache(device); std::string cache_key = "__boost_reduce_cpu_" + boost::lexical_cast(sizeof(T)); // for inputs smaller than serial_reduce_threshold // serial_reduce algorithm is used uint_ serial_reduce_threshold = parameters->get(cache_key, "serial_reduce_threshold", 16384 * sizeof(T)); serial_reduce_threshold = (std::max)(serial_reduce_threshold, uint_(compute_units)); const context &context = queue.get_context(); size_t count = detail::iterator_range_size(first, last); if(count == 0){ return; } else if(count < serial_reduce_threshold) { return serial_reduce(first, last, result, function, queue); } meta_kernel k("reduce_on_cpu"); buffer output(context, sizeof(result_type) * compute_units); size_t count_arg = k.add_arg("count"); size_t output_arg = k.add_arg(memory_object::global_memory, "output"); k << "uint block = " << "(uint)ceil(((float)count)/get_global_size(0));\n" << "uint index = get_global_id(0) * block;\n" << "uint end = min(count, index + block);\n" << k.decl("result") << " = " << first[k.var("index")] << ";\n" << "index++;\n" << "while(index < end){\n" << "result = " << function(k.var("result"), first[k.var("index")]) << ";\n" << "index++;\n" << "}\n" << "output[get_global_id(0)] = result;\n"; size_t global_work_size = compute_units; kernel kernel = k.compile(context); // reduction to global_work_size elements kernel.set_arg(count_arg, static_cast(count)); kernel.set_arg(output_arg, output); queue.enqueue_1d_range_kernel(kernel, 0, global_work_size, 0); // final reduction reduce_on_cpu( make_buffer_iterator(output), make_buffer_iterator(output, global_work_size), result, function, queue ); } } // end detail namespace } // end compute namespace } // end boost namespace #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP