/* * Copyright (c) 1997-1999 Massachusetts Institute of Technology * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ /* * executor_threads.c -- execute the fft in parallel using threads */ #include #include #include static void executor_simple_threads(int n, const fftw_complex *in, fftw_complex *out, fftw_plan_node *p, int istride, int ostride, int nthreads); typedef struct { int m,r; const fftw_complex *in; fftw_complex *out; fftw_plan_node *p; int istride, ostride; int nthreads; } executor_simple_data; static void *executor_simple_thread(fftw_loop_data *loop_data) { int min = loop_data->min, max = loop_data->max; executor_simple_data *d = (executor_simple_data *) loop_data->data; int m = d->m, r = d->r; const fftw_complex *in = d->in; fftw_complex *out = d->out; fftw_plan_node *p = d->p; int istride = d->istride, ostride = d->ostride; int nthreads = d->nthreads; for (; min < max; ++min) executor_simple_threads(m, in + min * istride, out + min * (m * ostride), p, istride * r, ostride, nthreads); return 0; } typedef struct { fftw_twiddle_codelet *codelet; int m, ntwiddle, ostride; fftw_complex *out, *W; } twiddle_thread_data; static void *twiddle_thread(fftw_loop_data *loop_data) { twiddle_thread_data *d = (twiddle_thread_data *) loop_data->data; HACK_ALIGN_STACK_EVEN; (d->codelet)(d->out + d->ostride * loop_data->min, d->W + d->ntwiddle * loop_data->min, d->m * d->ostride, loop_data->max - loop_data->min, d->ostride); return 0; } static void executor_simple_threads(int n, const fftw_complex *in, fftw_complex *out, fftw_plan_node *p, int istride, int ostride, int nthreads) { switch (p->type) { case FFTW_NOTW: HACK_ALIGN_STACK_ODD; (p->nodeu.notw.codelet) (in, out, istride, ostride); break; case FFTW_TWIDDLE: { int r = p->nodeu.twiddle.size; int m = n / r; int i; if (nthreads <= 1) { fftw_twiddle_codelet *codelet; fftw_complex *W; for (i = 0; i < r; ++i) { fftw_executor_simple(m, in + i * istride, out + i * (m * ostride), p->nodeu.twiddle.recurse, istride * r, ostride, FFTW_NORMAL_RECURSE); } codelet = p->nodeu.twiddle.codelet; W = p->nodeu.twiddle.tw->twarray; HACK_ALIGN_STACK_EVEN; codelet(out, W, m * ostride, m, ostride); } else { { executor_simple_data d; d.m = m; d.r = r; d.in = in; d.out = out; d.p = p->nodeu.twiddle.recurse; d.istride = istride; d.ostride = ostride; d.nthreads = nthreads / r; fftw_thread_spawn_loop(r, nthreads, executor_simple_thread,&d); } { twiddle_thread_data d; d.codelet = p->nodeu.twiddle.codelet; d.m = m; d.ntwiddle = p->nodeu.twiddle.codelet_desc->ntwiddle; d.ostride = ostride; d.out = out; d.W = p->nodeu.twiddle.tw->twarray; fftw_thread_spawn_loop(m, nthreads, twiddle_thread, &d); } } break; } case FFTW_RADER: { int r = p->nodeu.twiddle.size; int m = n / r; int i; if (nthreads <= 1) { fftw_rader_codelet *codelet; fftw_complex *W; for (i = 0; i < r; ++i) { fftw_executor_simple(m, in + i * istride, out + i * (m * ostride), p->nodeu.rader.recurse, istride * r, ostride, FFTW_NORMAL_RECURSE); } codelet = p->nodeu.rader.codelet; W = p->nodeu.rader.tw->twarray; codelet(out, W, m, r, ostride, p->nodeu.rader.rader_data); } else { { executor_simple_data d; d.m = m; d.r = r; d.in = in; d.out = out; d.p = p->nodeu.rader.recurse; d.istride = istride; d.ostride = ostride; d.nthreads = nthreads / r; fftw_thread_spawn_loop(r, nthreads, executor_simple_thread,&d); } { fftw_rader_codelet *codelet; fftw_complex *W; codelet = p->nodeu.rader.codelet; W = p->nodeu.rader.tw->twarray; codelet(out, W, m, r, ostride, p->nodeu.rader.rader_data); } } break; } case FFTW_GENERIC: { int r = p->nodeu.generic.size; int m = n / r; int i; fftw_generic_codelet *codelet; fftw_complex *W; if (nthreads <= 1) for (i = 0; i < r; ++i) { fftw_executor_simple(m, in + i * istride, out + i * (m * ostride), p->nodeu.generic.recurse, istride * r, ostride, FFTW_NORMAL_RECURSE); } else { executor_simple_data d; d.m = m; d.r = r; d.in = in; d.out = out; d.p = p->nodeu.generic.recurse; d.istride = istride; d.ostride = ostride; d.nthreads = nthreads / r; fftw_thread_spawn_loop(r, nthreads, executor_simple_thread, &d); } codelet = p->nodeu.generic.codelet; W = p->nodeu.generic.tw->twarray; codelet(out, W, m, r, n, ostride); break; } default: fftw_die("BUG in executor: invalid plan\n"); break; } } static void executor_simple_inplace_threads(int n, fftw_complex *in, fftw_complex *out, fftw_plan_node *p, int istride, int nthreads) { switch (p->type) { case FFTW_NOTW: HACK_ALIGN_STACK_ODD; (p->nodeu.notw.codelet) (in, in, istride, istride); break; default: { fftw_complex *tmp; if (out) tmp = out; else tmp = (fftw_complex *) fftw_malloc(n * sizeof(fftw_complex)); executor_simple_threads(n, in, tmp, p, istride, 1, nthreads); fftw_strided_copy(n, tmp, istride, in); if (!out) fftw_free(tmp); } } } typedef struct { union { fftw_notw_codelet *codelet; struct { int n; fftw_plan_node *p; } plan; } u; const fftw_complex *in; fftw_complex *out; int idist, odist, istride, ostride; } executor_many_data; static void *executor_many_codelet_thread(fftw_loop_data *loop_data) { int min = loop_data->min, max = loop_data->max; executor_many_data *d = (executor_many_data *) loop_data->data; fftw_notw_codelet *codelet = d->u.codelet; const fftw_complex *in = d->in; fftw_complex *out = d->out; int idist = d->idist, odist = d->odist; int istride = d->istride, ostride = d->ostride; HACK_ALIGN_STACK_ODD; for (; min < max; ++min) codelet(in + min * idist, out + min * odist, istride, ostride); return 0; } static void *executor_many_simple_thread(fftw_loop_data *loop_data) { int min = loop_data->min, max = loop_data->max; executor_many_data *d = (executor_many_data *) loop_data->data; int n = d->u.plan.n; fftw_plan_node *p = d->u.plan.p; const fftw_complex *in = d->in; fftw_complex *out = d->out; int idist = d->idist, odist = d->odist; int istride = d->istride, ostride = d->ostride; for (; min < max; ++min) fftw_executor_simple(n, in + min * idist, out + min * odist, p, istride, ostride, FFTW_NORMAL_RECURSE); return 0; } static void executor_many_threads(int n, const fftw_complex *in, fftw_complex *out, fftw_plan_node *p, int istride, int ostride, int howmany, int idist, int odist, int nthreads) { switch (p->type) { case FFTW_NOTW: { int s; if (nthreads <= 1) { fftw_notw_codelet *codelet = p->nodeu.notw.codelet; HACK_ALIGN_STACK_ODD; for (s = 0; s < howmany; ++s) codelet(in + s * idist, out + s * odist, istride, ostride); } else { executor_many_data d; d.in = in; d.out = out; d.u.codelet = p->nodeu.notw.codelet; d.istride = istride; d.ostride = ostride; d.idist = idist; d.odist = odist; fftw_thread_spawn_loop(howmany, nthreads, executor_many_codelet_thread, &d); } break; } default: { int s; if (nthreads <= 1) for (s = 0; s < howmany; ++s) { fftw_executor_simple(n, in + s * idist, out + s * odist, p, istride, ostride, FFTW_NORMAL_RECURSE); } else { executor_many_data d; d.in = in; d.out = out; d.u.plan.n = n; d.u.plan.p = p; d.istride = istride; d.ostride = ostride; d.idist = idist; d.odist = odist; fftw_thread_spawn_loop(howmany, nthreads, executor_many_simple_thread, &d); } } } } typedef struct { union { fftw_notw_codelet *codelet; struct { int n; fftw_plan_node *p; fftw_complex *tmp; } plan; } u; fftw_complex *in; int idist, istride; } executor_many_inplace_data; static void *executor_many_inplace_codelet_thread(fftw_loop_data *loop_data) { int min = loop_data->min, max = loop_data->max; executor_many_inplace_data *d = (executor_many_inplace_data *) loop_data->data; fftw_notw_codelet *codelet = d->u.codelet; fftw_complex *in = d->in; int idist = d->idist, istride = d->istride; HACK_ALIGN_STACK_ODD; for (; min < max; ++min) codelet(in + min * idist, in + min * idist, istride, istride); return 0; } static void *executor_many_inplace_simple_thread(fftw_loop_data *loop_data) { int min = loop_data->min, max = loop_data->max; executor_many_inplace_data *d = (executor_many_inplace_data *) loop_data->data; int n = d->u.plan.n; fftw_plan_node *p = d->u.plan.p; fftw_complex *tmp = d->u.plan.tmp + n * loop_data->thread_num; fftw_complex *in = d->in; int idist = d->idist, istride = d->istride; for (; min < max; ++min) { fftw_executor_simple(n, in + min * idist, tmp, p, istride, 1, FFTW_NORMAL_RECURSE); fftw_strided_copy(n, tmp, istride, in + min * idist); } return 0; } void fftw_executor_many_inplace_threads(int n, fftw_complex *in, fftw_complex *work, fftw_plan_node *p, int istride, int howmany, int idist, int nthreads) { switch (p->type) { case FFTW_NOTW: { int s; if (nthreads <= 1) { fftw_notw_codelet *codelet = p->nodeu.notw.codelet; HACK_ALIGN_STACK_ODD; for (s = 0; s < howmany; ++s) codelet(in + s * idist, in + s * idist, istride, istride); } else { executor_many_inplace_data d; d.in = in; d.u.codelet = p->nodeu.notw.codelet; d.istride = istride; d.idist = idist; fftw_thread_spawn_loop(howmany, nthreads, executor_many_inplace_codelet_thread, &d); } break; } default: { int s; fftw_complex *tmp; if (nthreads <= 1) { if (work) tmp = work; else tmp = (fftw_complex *) fftw_malloc(n * sizeof(fftw_complex)); for (s = 0; s < howmany; ++s) { fftw_executor_simple(n, in + s * idist, tmp, p, istride, 1, FFTW_NORMAL_RECURSE); fftw_strided_copy(n, tmp, istride, in + s * idist); } } else { executor_many_inplace_data d; if (work) tmp = work; else tmp = (fftw_complex *) fftw_malloc((nthreads > howmany ? howmany : nthreads) * n * sizeof(fftw_complex)); d.in = in; d.u.plan.n = n; d.u.plan.p = p; d.u.plan.tmp = tmp; d.istride = istride; d.idist = idist; fftw_thread_spawn_loop(howmany, nthreads, executor_many_inplace_simple_thread, &d); } if (!work) fftw_free(tmp); } } } /* user interface */ void fftw_threads(int nthreads, fftw_plan plan, int howmany, fftw_complex *in, int istride, int idist, fftw_complex *out, int ostride, int odist) { int n = plan->n; if (plan->flags & FFTW_IN_PLACE) { if (howmany == 1) { executor_simple_inplace_threads(n, in, out, plan->root, istride, nthreads); } else { fftw_executor_many_inplace_threads(n, in, NULL, plan->root, istride, howmany, idist, nthreads); } } else { if (howmany == 1) { executor_simple_threads(n, in, out, plan->root, istride, ostride, nthreads); } else { executor_many_threads(n, in, out, plan->root, istride, ostride, howmany, idist, odist, nthreads); } } } void fftw_threads_one(int nthreads, fftw_plan plan, fftw_complex *in, fftw_complex *out) { if (plan->flags & FFTW_IN_PLACE) executor_simple_inplace_threads(plan->n, in, out, plan->root, 1, nthreads); else executor_simple_threads(plan->n, in, out, plan->root, 1, 1, nthreads); }