//---------------------------------------------------------------------------// // Copyright (c) 2013 Kyle Lutz // // Distributed under the Boost Software License, Version 1.0 // See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt // // See http://boostorg.github.com/compute for more information. //---------------------------------------------------------------------------// #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP #define BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP #include #include #include namespace boost { namespace compute { namespace detail { template class count_if_with_threads_kernel : meta_kernel { public: typedef typename std::iterator_traits::value_type value_type; count_if_with_threads_kernel() : meta_kernel("count_if_with_threads") { } void set_args(InputIterator first, InputIterator last, Predicate predicate) { typedef typename std::iterator_traits::value_type T; m_size = detail::iterator_range_size(first, last); m_size_arg = add_arg("size"); m_counts_arg = add_arg(memory_object::global_memory, "counts"); *this << // thread parameters "const uint gid = get_global_id(0);\n" << "const uint block_size = size / get_global_size(0);\n" << "const uint start = block_size * gid;\n" << "uint end = 0;\n" << "if(gid == get_global_size(0) - 1)\n" << " end = size;\n" << "else\n" << " end = block_size * gid + block_size;\n" << // count values "uint count = 0;\n" << "for(uint i = start; i < end; i++){\n" << decl("value") << "=" << first[expr("i")] << ";\n" << if_(predicate(var("value"))) << "{\n" << "count++;\n" << "}\n" << "}\n" << // write count "counts[gid] = count;\n"; } size_t exec(command_queue &queue) { const device &device = queue.get_device(); const context &context = queue.get_context(); size_t threads = device.compute_units(); const size_t minimum_block_size = 2048; if(m_size / threads < minimum_block_size){ threads = static_cast( (std::max)( std::ceil(float(m_size) / minimum_block_size), 1.0f ) ); } // storage for counts ::boost::compute::vector counts(threads, context); // exec kernel set_arg(m_size_arg, static_cast(m_size)); set_arg(m_counts_arg, counts.get_buffer()); exec_1d(queue, 0, threads, 1); // copy counts to the host std::vector host_counts(threads); ::boost::compute::copy(counts.begin(), counts.end(), host_counts.begin(), queue); // return sum of counts return std::accumulate(host_counts.begin(), host_counts.end(), size_t(0)); } private: size_t m_size; size_t m_size_arg; size_t m_counts_arg; }; // counts values that match the predicate using one thread per block. this is // optimized for cpu-type devices with a small number of compute units. template inline size_t count_if_with_threads(InputIterator first, InputIterator last, Predicate predicate, command_queue &queue) { count_if_with_threads_kernel kernel; kernel.set_args(first, last, predicate); return kernel.exec(queue); } } // end detail namespace } // end compute namespace } // end boost namespace #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP