From mboxrd@z Thu Jan 1 00:00:00 1970 X-Msuck: nntp://news.gmane.org/gmane.linux.lib.musl.general/14986 Path: news.gmane.org!.POSTED.blaine.gmane.org!not-for-mail From: "Stefan Kanthak" Newsgroups: gmane.linux.lib.musl.general Subject: More patches for math subtree Date: Tue, 10 Dec 2019 17:57:55 +0100 Organization: Me, myself & IT Message-ID: <2C3325A208DA4260A1A0F7B4517D6DFA@H270> Reply-To: musl@lists.openwall.com Mime-Version: 1.0 Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: 7bit Injection-Info: blaine.gmane.org; posting-host="blaine.gmane.org:195.159.176.226"; logging-data="193214"; mail-complaints-to="usenet@blaine.gmane.org" To: Original-X-From: musl-return-15002-gllmg-musl=m.gmane.org@lists.openwall.com Tue Dec 10 18:02:25 2019 Return-path: Envelope-to: gllmg-musl@m.gmane.org Original-Received: from mother.openwall.net ([195.42.179.200]) by blaine.gmane.org with smtp (Exim 4.89) (envelope-from ) id 1ieiu1-000oBF-JY for gllmg-musl@m.gmane.org; Tue, 10 Dec 2019 18:02:25 +0100 Original-Received: (qmail 8183 invoked by uid 550); 10 Dec 2019 17:02:23 -0000 Mailing-List: contact musl-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Original-Received: (qmail 5778 invoked from network); 10 Dec 2019 16:59:16 -0000 X-Priority: 3 X-MSMail-Priority: Normal X-Mailer: Microsoft Windows Mail 6.0.6002.18197 X-MimeOLE: Produced By Microsoft MimeOLE V6.1.7601.24158 X-VADE-STATUS: LEGIT Xref: news.gmane.org gmane.linux.lib.musl.general:14986 Archived-At: Some more optimisations: the current implementations of ceil(), floor() and trunc() for i386 change the rounding control using fldcw instructions, which are SLOW; these patches provide faster and smaller branch-free (!) implementations. JFTR: I'm NOT subscribed to your mailing list, so CC: me in replies! --- -/src/math/i386/floor.s +++ +/src/math/i386/floor.s @@ -1,67 +1,26 @@ .global floorf .type floorf,@function floorf: flds 4(%esp) jmp 1f .global floorl .type floorl,@function floorl: fldt 4(%esp) jmp 1f .global floor .type floor,@function floor: fldl 4(%esp) +1: fld %st(0) + frndint + fxch %st(1) + fucomip %st(1),%st(0) + fld1 + fldz + fcmovb %st(1),%st(0) + fsubp %st(0),%st(2) + fstp %st(0) + ret -1: mov $0x7,%al -1: fstcw 4(%esp) - mov 5(%esp),%ah - mov %al,5(%esp) - fldcw 4(%esp) - frndint - mov %ah,5(%esp) - fldcw 4(%esp) - ret - -.global ceil -.type ceil,@function -ceil: - fldl 4(%esp) - mov $0xb,%al - jmp 1b - -.global ceilf -.type ceilf,@function -ceilf: - flds 4(%esp) - mov $0xb,%al - jmp 1b - -.global ceill -.type ceill,@function -ceill: - fldt 4(%esp) - mov $0xb,%al - jmp 1b - -.global trunc -.type trunc,@function -trunc: - fldl 4(%esp) - mov $0xf,%al - jmp 1b - -.global truncf -.type truncf,@function -truncf: - flds 4(%esp) - mov $0xf,%al - jmp 1b - -.global truncl -.type truncl,@function -truncl: - fldt 4(%esp) - mov $0xf,%al - jmp 1b --- -/src/math/i386/ceilf.s +++ +/src/math/i386/ceilf.s @@ -1,1 +1,1 @@ -# see floor.s +# see ceil.s --- -/src/math/i386/ceill.s +++ +/src/math/i386/ceill.s @@ -1,1 +1,1 @@ -# see floor.s +# see ceil.s --- -/src/math/i386/ceil.s +++ +/src/math/i386/ceil.s @@ -1,1 +1,26 @@ -# see floor.s +.global ceilf +.type ceilf,@function +ceilf: + flds 4(%esp) + jmp 1f + +.global ceill +.type ceill,@function +ceill: + fldt 4(%esp) + jmp 1f + +.global ceil +.type ceil,@function +ceil: + fldl 4(%esp) +1: fld %st(0) + frndint + fxch %st(1) + fucomip %st(1),%st(0) + fld1 + fldz + fcmovnbe %st(1),%st(0) + faddp %st(0),%st(1) + fstp %st(0) + ret --- -/src/math/i386/truncf.s +++ +/src/math/i386/truncf.s @@ -1,1 +1,1 @@ -# see floor.s +# see trunc.s --- -/src/math/i386/truncl.s +++ +/src/math/i386/truncl.s @@ -1,1 +1,1 @@ -# see floor.s +# see trunc.s --- -/src/math/i386/trunc.s +++ +/src/math/i386/trunc.s @@ -1,1 +1,32 @@ -# see floor.s +.global truncf +.type truncf,@function +truncf: + flds 4(%esp) + jmp 1f + +.global truncl +.type truncl,@function +truncl: + fldt 4(%esp) + jmp 1f + +.global trunc +.type trunc,@function +trunc: + fldl 4(%esp) +1: fld %st(0) + fabs + fld %st(0) + frndint + fxch %st(1) + fucomip %st(1),%st(0) + fldz + fld1 + fcmovnb %st(1),%st(0) + fsubp %st(0),%st(2) + fucomip %st(2),%st(0) + fst %st(1) + fchs + fcmovbe %st(1),%st(0) + fstp %st(1) + ret