//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
#define BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
#include <cmath>
#include <iterator>

#include <boost/static_assert.hpp>

#include <boost/compute/system.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/array.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/algorithm/copy_n.hpp>
#include <boost/compute/algorithm/detail/inplace_reduce.hpp>
#include <boost/compute/algorithm/detail/reduce_on_gpu.hpp>
#include <boost/compute/algorithm/detail/reduce_on_cpu.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/memory/local_buffer.hpp>
#include <boost/compute/type_traits/result_of.hpp>
#include <boost/compute/type_traits/is_device_iterator.hpp>
namespace boost {
namespace compute {
namespace detail {
template
size_t reduce(InputIterator first,
size_t count,
OutputIterator result,
size_t block_size,
BinaryFunction function,
command_queue &queue)
{
typedef typename
std::iterator_traits::value_type
input_type;
typedef typename
boost::compute::result_of::type
result_type;
const context &context = queue.get_context();
size_t block_count = count / 2 / block_size;
size_t total_block_count =
static_cast(std::ceil(float(count) / 2.f / float(block_size)));
if(block_count != 0){
meta_kernel k("block_reduce");
size_t output_arg = k.add_arg(memory_object::global_memory, "output");
size_t block_arg = k.add_arg(memory_object::local_memory, "block");
k <<
"const uint gid = get_global_id(0);\n" <<
"const uint lid = get_local_id(0);\n" <<
// copy values to local memory
"block[lid] = " <<
function(first[k.make_var("gid*2+0")],
first[k.make_var("gid*2+1")]) << ";\n" <<
// perform reduction
"for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
" barrier(CLK_LOCAL_MEM_FENCE);\n" <<
" uint mask = (i << 1) - 1;\n" <<
" if((lid & mask) == 0){\n" <<
" block[lid] = " <<
function(k.expr("block[lid]"),
k.expr("block[lid+i]")) << ";\n" <<
" }\n" <<
"}\n" <<
// write block result to global output
"if(lid == 0)\n" <<
" output[get_group_id(0)] = block[0];\n";
kernel kernel = k.compile(context);
kernel.set_arg(output_arg, result.get_buffer());
kernel.set_arg(block_arg, local_buffer(block_size));
queue.enqueue_1d_range_kernel(kernel,
0,
block_count * block_size,
block_size);
}
// serially reduce any leftovers
if(block_count * block_size * 2 < count){
size_t last_block_start = block_count * block_size * 2;
meta_kernel k("extra_serial_reduce");
size_t count_arg = k.add_arg("count");
size_t offset_arg = k.add_arg("offset");
size_t output_arg = k.add_arg(memory_object::global_memory, "output");
size_t output_offset_arg = k.add_arg("output_offset");
k <<
k.decl("result") << " = \n" <<
first[k.expr("offset")] << ";\n" <<
"for(uint i = offset + 1; i < count; i++)\n" <<
" result = " <<
function(k.var("result"),
first[k.var("i")]) << ";\n" <<
"output[output_offset] = result;\n";
kernel kernel = k.compile(context);
kernel.set_arg(count_arg, static_cast(count));
kernel.set_arg(offset_arg, static_cast(last_block_start));
kernel.set_arg(output_arg, result.get_buffer());
kernel.set_arg(output_offset_arg, static_cast(block_count));
queue.enqueue_task(kernel);
}
return total_block_count;
}
template
inline vector<
typename boost::compute::result_of<
BinaryFunction(
typename std::iterator_traits::value_type,
typename std::iterator_traits::value_type
)
>::type
>
block_reduce(InputIterator first,
size_t count,
size_t block_size,
BinaryFunction function,
command_queue &queue)
{
typedef typename
std::iterator_traits::value_type
input_type;
typedef typename
boost::compute::result_of::type
result_type;
const context &context = queue.get_context();
size_t total_block_count =
static_cast(std::ceil(float(count) / 2.f / float(block_size)));
vector result_vector(total_block_count, context);
reduce(first, count, result_vector.begin(), block_size, function, queue);
return result_vector;
}
// Space complexity: O( ceil(n / 2 / 256) )
template
inline void generic_reduce(InputIterator first,
InputIterator last,
OutputIterator result,
BinaryFunction function,
command_queue &queue)
{
typedef typename
std::iterator_traits::value_type
input_type;
typedef typename
boost::compute::result_of::type
result_type;
const device &device = queue.get_device();
const context &context = queue.get_context();
size_t count = detail::iterator_range_size(first, last);
if(device.type() & device::cpu){
array value(context);
detail::reduce_on_cpu(first, last, value.begin(), function, queue);
boost::compute::copy_n(value.begin(), 1, result, queue);
}
else {
size_t block_size = 256;
// first pass
vector results = detail::block_reduce(first,
count,
block_size,
function,
queue);
if(results.size() > 1){
detail::inplace_reduce(results.begin(),
results.end(),
function,
queue);
}
boost::compute::copy_n(results.begin(), 1, result, queue);
}
}
template
inline void dispatch_reduce(InputIterator first,
InputIterator last,
OutputIterator result,
const plus &function,
command_queue &queue)
{
const context &context = queue.get_context();
const device &device = queue.get_device();
// reduce to temporary buffer on device
array value(context);
if(device.type() & device::cpu){
detail::reduce_on_cpu(first, last, value.begin(), function, queue);
}
else {
reduce_on_gpu(first, last, value.begin(), function, queue);
}
// copy to result iterator
copy_n(value.begin(), 1, result, queue);
}
template
inline void dispatch_reduce(InputIterator first,
InputIterator last,
OutputIterator result,
BinaryFunction function,
command_queue &queue)
{
generic_reduce(first, last, result, function, queue);
}
} // end detail namespace
/// Returns the result of applying \p function to the elements in the
/// range [\p first, \p last).
///
/// If no function is specified, \c plus<T>() will be used.
///
/// \param first first element in the input range
/// \param last last element in the input range
/// \param result iterator pointing to the output
/// \param function binary reduction function
/// \param queue command queue to perform the operation
///
/// The \c reduce() algorithm assumes that the binary reduction function is
/// associative. When used with non-associative functions the result may
/// be non-deterministic and vary in precision. Notably this affects the
/// \c plus<float>() function as floating-point addition is not associative
/// and may produce slightly different results than a serial algorithm.
///
/// This algorithm supports both host and device iterators for the
/// result argument. This allows for values to be reduced and copied
/// to the host all with a single function call.
///
/// For example, to calculate the sum of the values in a device vector and
/// copy the result to a value on the host:
///
/// \snippet test/test_reduce.cpp sum_int
///
/// Note that while the \c reduce() algorithm is conceptually identical to
/// the \c accumulate() algorithm, its implementation is substantially more
/// efficient on parallel hardware. For more information, see the documentation
/// on the \c accumulate() algorithm.
///
/// Space complexity on GPUs: \Omega(n)
/// Space complexity on CPUs: \Omega(1)
///
/// \see accumulate()
template
inline void reduce(InputIterator first,
InputIterator last,
OutputIterator result,
BinaryFunction function,
command_queue &queue = system::default_queue())
{
BOOST_STATIC_ASSERT(is_device_iterator::value);
if(first == last){
return;
}
detail::dispatch_reduce(first, last, result, function, queue);
}
/// \overload
template
inline void reduce(InputIterator first,
InputIterator last,
OutputIterator result,
command_queue &queue = system::default_queue())
{
BOOST_STATIC_ASSERT(is_device_iterator::value);
typedef typename std::iterator_traits::value_type T;
if(first == last){
return;
}
detail::dispatch_reduce(first, last, result, plus(), queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_REDUCE_HPP