Blackfin arch: Replace C version of 64 bit multiply with hand optimized assembly

Signed-off-by: Bernd Schmidt <bernds_cb1@t-online.de>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Authored by Bernd Schmidt on 2009-01-07 23:14:39 +08:00; committed by Bryan Wu
parent 36478585d9
commit 71ae92f51a
2 changed files with 68 additions and 99 deletions
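
For context: on a 32-bit port such as Blackfin, GCC lowers every 64-bit integer multiplication to a call to the libgcc helper __muldi3, and since the kernel is not linked against libgcc it must carry its own copy; this commit replaces the generic C copy with hand-written assembly (Blackfin prepends an underscore to C symbol names, hence the assembly label ___muldi3). A minimal sketch of C code that triggers such a call, with an illustrative helper name that is not part of the commit:

#include <stdint.h>

/* Any 64-bit multiply in C, such as this illustrative helper, is lowered
 * by GCC on 32-bit Blackfin to a helper call rather than inline code,
 * which is why the kernel has to ship its own __muldi3. */
uint64_t mul64(uint64_t a, uint64_t b)
{
        return a * b;   /* compiles to a call to ___muldi3 */
}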

@@ -0,0 +1,68 @@
.align 2
.global ___muldi3;
.type ___muldi3, STT_FUNC;
#ifdef CONFIG_ARITHMETIC_OPS_L1
.section .l1.text
#else
.text
#endif
/*
R1:R0 * R3:R2
= R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l
[X] = (R1.h * R3.h) * 2^96
[X] + (R1.h * R3.l + R1.l * R3.h) * 2^80
[X] + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64
[T1] + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48
[T2] + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32
[T3] + (R0.l * R2.h + R2.l * R0.h) * 2^16
[T4] + (R0.l * R2.l)
We can discard the first three lines marked "X" since we produce
only a 64 bit result. So, we need ten 16-bit multiplies.
Individual mul-acc results:
[E1] = R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h
[E2] = R1.l * R2.l + R3.l * R0.l + R0.h * R2.h
[E3] = R0.l * R2.h + R2.l * R0.h
[E4] = R0.l * R2.l
We also need to add high parts from lower-level results to higher ones:
E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4
One interesting property is that all parts of the result that depend
on the sign of the multiplication are discarded. Those would be the
multiplications involving R1.h and R3.h, but only the top 16 bits of
each 32-bit product depend on the sign, and since R1.h and R3.h occur
only in E1, the top half of those products is cut off anyway.
So, we can just use FU mode for all of the 16-bit multiplies, and
ignore questions of when to use mixed mode. */
___muldi3:
        /* [SP] technically is part of the caller's frame, but we can
           use it as scratch space. */
        A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) || R3 = [SP + 12]; /* E1 */
        A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) || [SP] = R4; /* E1 */
        A0 += A1; /* E1 */
        R4 = A0.w;
        A0 = R0.l * R3.l (FU); /* E2 */
        A0 += R2.l * R1.l (FU); /* E2 */

        A1 = R2.L * R0.L (FU); /* E4 */
        R3 = A1.w;
        A1 = A1 >> 16; /* E3c */
        A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU); /* E2, E3c */
        A1 += R0.L * R2.H (FU); /* E3c */
        R0 = A1.w;
        A1 = A1 >> 16; /* E2c */
        A0 += A1; /* E2c */
        R1 = A0.w;

        /* low(result) = low(E3c):low(E4) */
        R0 = PACK (R0.l, R3.l);
        /* high(result) = E2c + (E1 << 16) */
        R1.h = R1.h + R4.l (NS) || R4 = [SP];
        RTS;
.size ___muldi3, .-___muldi3
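
For reference, here is a standalone C model of the E1..E4 / E[n]c scheme laid out in the comment block at the top of the new file; it is only a sketch, the helper and variable names are illustrative, and 64-bit intermediates stand in for the 40-bit A0/A1 accumulators used by the assembly:

#include <assert.h>
#include <stdint.h>

static uint64_t muldi3_model(uint64_t a, uint64_t b)
{
        /* 16-bit digits, least significant first: a0 = R0.l ... a3 = R1.h,
           b0 = R2.l ... b3 = R3.h. */
        uint32_t a0 = a & 0xffff, a1 = (a >> 16) & 0xffff;
        uint32_t a2 = (a >> 32) & 0xffff, a3 = (a >> 48) & 0xffff;
        uint32_t b0 = b & 0xffff, b1 = (b >> 16) & 0xffff;
        uint32_t b2 = (b >> 32) & 0xffff, b3 = (b >> 48) & 0xffff;

        /* The ten unsigned 16x16 multiplies (FU mode in the assembly). */
        uint64_t e4 = (uint64_t)a0 * b0;                                        /* weight 2^0  */
        uint64_t e3 = (uint64_t)a0 * b1 + (uint64_t)a1 * b0;                    /* weight 2^16 */
        uint64_t e2 = (uint64_t)a2 * b0 + (uint64_t)a0 * b2 + (uint64_t)a1 * b1; /* weight 2^32 */
        uint64_t e1 = (uint64_t)a3 * b0 + (uint64_t)a0 * b3
                    + (uint64_t)a2 * b1 + (uint64_t)a1 * b2;                    /* weight 2^48 */

        /* Carry chain from the comment: E[n]c = E[n] + (E[n+1]c >> 16), E4c = E4. */
        uint64_t e3c = e3 + (e4 >> 16);
        uint64_t e2c = e2 + (e3c >> 16);

        uint32_t lo = ((uint32_t)e3c << 16) | (uint32_t)(e4 & 0xffff); /* low(E3c):low(E4) */
        uint32_t hi = (uint32_t)e2c + ((uint32_t)e1 << 16);            /* E2c + (E1 << 16) */

        return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
        /* Patterns with the sign bit set exercise the "signs can be ignored"
           argument: the low 64 bits of a signed product equal the unsigned one. */
        uint64_t t[] = { 0, 1, 0xffffULL, 0xffffffffffffffffULL,
                         0x123456789abcdef0ULL, (uint64_t)-12345 };
        for (unsigned i = 0; i < sizeof(t) / sizeof(t[0]); i++)
                for (unsigned j = 0; j < sizeof(t) / sizeof(t[0]); j++)
                        assert(muldi3_model(t[i], t[j]) == t[i] * t[j]);
        return 0;
}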

@@ -1,99 +0,0 @@
/*
* File: arch/blackfin/lib/muldi3.c
* Based on:
* Author:
*
* Created:
* Description:
*
* Modified:
* Copyright 2004-2006 Analog Devices Inc.
*
* Bugs: Enter bugs at http://blackfin.uclinux.org/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see the file COPYING, or write
* to the Free Software Foundation, Inc.,
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SI_TYPE_SIZE
#define SI_TYPE_SIZE 32
#endif
#define __ll_b (1L << (SI_TYPE_SIZE / 2))
#define __ll_lowpart(t) ((usitype) (t) % __ll_b)
#define __ll_highpart(t) ((usitype) (t) / __ll_b)
#define BITS_PER_UNIT 8
#if !defined(umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
        do { \
                usitype __x0, __x1, __x2, __x3; \
                usitype __ul, __vl, __uh, __vh; \
                \
                __ul = __ll_lowpart (u); \
                __uh = __ll_highpart (u); \
                __vl = __ll_lowpart (v); \
                __vh = __ll_highpart (v); \
                \
                __x0 = (usitype) __ul * __vl; \
                __x1 = (usitype) __ul * __vh; \
                __x2 = (usitype) __uh * __vl; \
                __x3 = (usitype) __uh * __vh; \
                \
                __x1 += __ll_highpart (__x0); /* this can't give carry */ \
                __x1 += __x2;                 /* but this indeed can */ \
                if (__x1 < __x2)              /* did we get it? */ \
                        __x3 += __ll_b;       /* yes, add it in the proper pos. */ \
                \
                (w1) = __x3 + __ll_highpart (__x1); \
                (w0) = __ll_lowpart (__x1) * __ll_b + __ll_lowpart (__x0); \
        } while (0)
#endif
#if !defined(__umulsidi3)
#define __umulsidi3(u, v) \
        ({ diunion __w; \
           umul_ppmm (__w.s.high, __w.s.low, u, v); \
           __w.ll; })
#endif
typedef unsigned int usitype __attribute__ ((mode(SI)));
typedef int sitype __attribute__ ((mode(SI)));
typedef int ditype __attribute__ ((mode(DI)));
typedef int word_type __attribute__ ((mode(__word__)));
struct distruct {
        sitype low, high;
};

typedef union {
        struct distruct s;
        ditype ll;
} diunion;
#ifdef CONFIG_ARITHMETIC_OPS_L1
ditype __muldi3(ditype u, ditype v)__attribute__((l1_text));
#endif
ditype __muldi3(ditype u, ditype v)
{
        diunion w;
        diunion uu, vv;

        uu.ll = u, vv.ll = v;

        w.ll = __umulsidi3(uu.s.low, vv.s.low);
        w.s.high += ((usitype) uu.s.low * (usitype) vv.s.high
                     + (usitype) uu.s.high * (usitype) vv.s.low);

        return w.ll;
}
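
The removed C version relies on the identity low64(u * v) = __umulsidi3(u.low, v.low) + ((u.low * v.high + u.high * v.low) << 32): the uu.s.high * vv.s.high term would only affect bits above 63, and umul_ppmm builds the full 32x32 -> 64 unsigned product from 16-bit halves, using the unsigned comparison __x1 < __x2 to detect the carry out of the middle sum. A rough equivalent with fixed-width types, for illustration only (names are mine, not from the kernel):

#include <assert.h>
#include <stdint.h>

/* Same identity as the deleted __muldi3, with fixed-width types instead
 * of the mode(SI)/mode(DI) typedefs: returns the low 64 bits of u * v. */
static uint64_t muldi3_c_model(uint64_t u, uint64_t v)
{
        uint32_t ul = (uint32_t)u, uh = (uint32_t)(u >> 32);
        uint32_t vl = (uint32_t)v, vh = (uint32_t)(v >> 32);

        /* What __umulsidi3(ul, vl) computes via umul_ppmm. */
        uint64_t w = (uint64_t)ul * vl;

        /* Cross terms only affect the high word; they may wrap mod 2^32,
           exactly like the "w.s.high += ..." in the code above. */
        uint32_t cross = ul * vh + uh * vl;

        return w + ((uint64_t)cross << 32);
}

int main(void)
{
        uint64_t u = 0x123456789abcdef0ULL, v = (uint64_t)-7;
        assert(muldi3_c_model(u, v) == u * v);
        return 0;
}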