zsh-workers
 help / color / mirror / code / Atom feed
* Optimization of string.c
@ 2015-10-05 17:23 Sebastian Gniazdowski
  2015-10-05 17:27 ` Sebastian Gniazdowski
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Sebastian Gniazdowski @ 2015-10-05 17:23 UTC (permalink / raw)
  To: zsh-workers

[-- Attachment #1: Type: text/plain, Size: 1391 bytes --]

Hello,
I attach a patch that uses memcpy instead of strcpy for strings of
length 8 and more. The four attached scripts reveal performance gains
of 10.5%, 22.4%, 8.5%, 14.7%. I didn't run tests that would e.g.
search for more optimal value that the 8. I thought that I will ask
for ideas for such tests. When is ztrdup() called often? Or rarely. I
would use the ideas and run the tests tomorrow (probably for whole
day).

I was long struggling with bug in llvm compiler. Memcpy can like
randomly slow down there. I documented this here:

https://www.youtube.com/watch?v=1HVDbU7Sbnw

The gains exceed my expectations, they were rather lower on OS X,
however the bug prevented obtaining anything that's consistent. I just
warn that the gains might be lower. I now run the tests on FreeBSD
machine. It is worth noting that the FreeBSD's memcpy isn't the
fastest one. I would say it is slow
(http://www.embedded.com/design/configurable-systems/4024961/Optimizing-Memcpy-improves-speed).
Tomorrow I will run the tests also on Linux machine.

It might be that the faster the CPU is, the lower the gain is. If
someone could repeat the tests on machine with 1.5 GHz or more it
would be of value.

- OS X 10.9.2, clang-500.2.79, zsh version used in the video 377e240,
Core i5 2.3 GHz
- FreeBSD 10.1, gcc 4.8.4, zsh version used in tests 64061e5, Pentium M 600 MHz

Best regards,
Sebastian Gniazdowski

[-- Attachment #2: copymemory.patch --]
[-- Type: application/octet-stream, Size: 4430 bytes --]

diff --git a/Src/string.c b/Src/string.c
index 04e7446..8a6c44e 100644
--- a/Src/string.c
+++ b/Src/string.c
@@ -28,16 +28,34 @@
 
 #include "zsh.mdh"
 
+// This is the version that I submit:
+#define copymemory_maybestring(dest,src,l,type) do      \
+            {                                           \
+                if( l < 8 ) {                           \
+                    type *d_;                           \
+                    const type *s_;                     \
+                    d_ = dest;                          \
+                    s_ = src;                           \
+                    while( (*d_++ = *s_++) ) {}         \
+                } else {                                \
+                    memcpy(dest, src, l*sizeof(type));  \
+                }                                       \
+            } while(0);
+
 /**/
 mod_export char *
 dupstring(const char *s)
 {
     char *t;
+    size_t lenw0;
 
     if (!s)
 	return NULL;
-    t = (char *) zhalloc(strlen((char *)s) + 1);
-    strcpy(t, s);
+
+    lenw0 = 1 + strlen((char *)s);
+    t = (char *) zhalloc(lenw0 * sizeof(char));
+    copymemory_maybestring(t, s, lenw0, char);
+
     return t;
 }
 
@@ -46,11 +64,15 @@ mod_export char *
 ztrdup(const char *s)
 {
     char *t;
+    size_t lenw0;
 
     if (!s)
 	return NULL;
-    t = (char *)zalloc(strlen((char *)s) + 1);
-    strcpy(t, s);
+
+    lenw0= 1 + strlen((char *)s);
+    t = (char *)zalloc(lenw0 * sizeof(char));
+    copymemory_maybestring(t, s, lenw0, char);
+
     return t;
 }
 
@@ -61,11 +83,15 @@ mod_export wchar_t *
 wcs_ztrdup(const wchar_t *s)
 {
     wchar_t *t;
+    size_t lenw0;
 
     if (!s)
 	return NULL;
-    t = (wchar_t *)zalloc(sizeof(wchar_t) * (wcslen((wchar_t *)s) + 1));
-    wcscpy(t, s);
+
+    lenw0 = 1 + wcslen((wchar_t *)s);
+    t = (wchar_t *)zalloc(lenw0 * sizeof(wchar_t));
+    copymemory_maybestring(t, s, lenw0, wchar_t);
+
     return t;
 }
 /**/
@@ -80,13 +106,14 @@ tricat(char const *s1, char const *s2, char const *s3)
 {
     /* This version always uses permanently-allocated space. */
     char *ptr;
-    size_t l1 = strlen(s1);
-    size_t l2 = strlen(s2);
-
-    ptr = (char *)zalloc(l1 + l2 + strlen(s3) + 1);
-    strcpy(ptr, s1);
-    strcpy(ptr + l1, s2);
-    strcpy(ptr + l1 + l2, s3);
+    size_t l1w0 = 1 + strlen(s1);
+    size_t l2w0 = 1 + strlen(s2);
+    size_t l3w0 = 1 + strlen(s3);
+
+    ptr = (char *)zalloc((l1w0 + l2w0 + l3w0 - 2) * sizeof(char));
+    copymemory_maybestring(ptr, s1, l1w0, char);
+    copymemory_maybestring(ptr + l1w0 - 1, s2, l2w0, char);
+    copymemory_maybestring(ptr + l1w0 - 1 + l2w0 - 1, s3, l3w0, char);
     return ptr;
 }
 
@@ -95,13 +122,14 @@ mod_export char *
 zhtricat(char const *s1, char const *s2, char const *s3)
 {
     char *ptr;
-    size_t l1 = strlen(s1);
-    size_t l2 = strlen(s2);
-
-    ptr = (char *)zhalloc(l1 + l2 + strlen(s3) + 1);
-    strcpy(ptr, s1);
-    strcpy(ptr + l1, s2);
-    strcpy(ptr + l1 + l2, s3);
+    size_t l1w0 = 1 + strlen(s1);
+    size_t l2w0 = 1 + strlen(s2);
+    size_t l3w0 = 1 + strlen(s3);
+
+    ptr = (char *)zhalloc((l1w0 + l2w0 + l3w0 - 2) * sizeof(char));
+    copymemory_maybestring(ptr, s1, l1w0, char);
+    copymemory_maybestring(ptr + l1w0 - 1, s2, l2w0, char);
+    copymemory_maybestring(ptr + l1w0 - 1 + l2w0 - 1, s3, l3w0, char);
     return ptr;
 }
 
@@ -113,11 +141,12 @@ dyncat(const char *s1, const char *s2)
 {
     /* This version always uses space from the current heap. */
     char *ptr;
-    size_t l1 = strlen(s1);
+    size_t l1w0 = 1 + strlen(s1);
+    size_t l2w0 = 1 + strlen(s2);
 
-    ptr = (char *)zhalloc(l1 + strlen(s2) + 1);
-    strcpy(ptr, s1);
-    strcpy(ptr + l1, s2);
+    ptr = (char *)zhalloc(l1w0 + l2w0 - 1);
+    copymemory_maybestring(ptr, s1, l1w0, char);
+    copymemory_maybestring(ptr + l1w0 - 1, s2, l2w0, char);
     return ptr;
 }
 
@@ -127,11 +156,12 @@ bicat(const char *s1, const char *s2)
 {
     /* This version always uses permanently-allocated space. */
     char *ptr;
-    size_t l1 = strlen(s1);
+    size_t l1w0 = 1 + strlen(s1);
+    size_t l2w0 = 1 + strlen(s2);
 
-    ptr = (char *)zalloc(l1 + strlen(s2) + 1);
-    strcpy(ptr, s1);
-    strcpy(ptr + l1, s2);
+    ptr = (char *)zalloc(l1w0 + l2w0 - 1);
+    copymemory_maybestring(ptr, s1, l1w0, char);
+    copymemory_maybestring(ptr + l1w0 - 1, s2, l2w0, char);
     return ptr;
 }
 

[-- Attachment #3: testopt1.zsh --]
[-- Type: application/octet-stream, Size: 429 bytes --]

#!/usr/local/bin/zsh-8opt
#!/usr/local/bin/zsh-clean

# opt:   1)    5       11351.07  2270.21  100.00%  11351.07  2270.21  100.00%  strtest
# clean: 1)    5       12671.24  2534.25  100.00%  12671.24  2534.25  100.00%  strtest

# 11351.07 / 12671.24 = 0.895814

zmodload zsh/zprof

strtest() {
    a=""

    i=4000
    while (( i -- )); do
        b=$a
        a+="$i"
    done
}

strtest
strtest
strtest
strtest
strtest

zprof

[-- Attachment #4: testopt2.zsh --]
[-- Type: application/octet-stream, Size: 414 bytes --]

#!/usr/local/bin/zsh-8opt
#!/usr/local/bin/zsh-clean

# opt:   1)    5        1771.97   354.39  100.00%   1771.97   354.39  100.00%  strtest
# clean: 1)    5        2254.05   450.81  100.00%   2254.05   450.81  100.00%  strtest

# 1771.97 / 2254.05 = 0.786127

zmodload zsh/zprof

strtest() {
    a=""

    i=4000
    while (( i -- )); do
        a+="$i"
    done
}

strtest
strtest
strtest
strtest
strtest

zprof

[-- Attachment #5: testopt3.zsh --]
[-- Type: application/octet-stream, Size: 407 bytes --]

#!/usr/local/bin/zsh-clean
#!/usr/local/bin/zsh-8opt

# opt:   1)    1       70712.42 70712.42  100.00%  70712.42 70712.42  100.00%  strtest 
# clean: 1)    1       77258.67 77258.67  100.00%  77258.67 77258.67  100.00%  strtest

# 70712.42 / 77258.67 = 0.915268

zmodload zsh/zprof

strtest() {
    a=""

    i=$(( 5*4000 ))
    while (( i -- )); do
        b=$a
        a+="$i"
    done
}

strtest

zprof

[-- Attachment #6: testopt4.zsh --]
[-- Type: application/octet-stream, Size: 391 bytes --]

#!/usr/local/bin/zsh-8opt
#!/usr/local/bin/zsh-clean

# opt:   1)    1       13155.08 13155.08  100.00%  13155.08 13155.08  100.00%  strtest
# clean: 1)    1       15420.33 15420.33  100.00%  15420.33 15420.33  100.00%  strtest

# 13155.08 / 15420.33 = 0.8531

zmodload zsh/zprof

strtest() {
    a=""

    i=$(( 5*4000 ))
    while (( i -- )); do
        a+="$i"
    done
}

strtest

zprof

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Optimization of string.c
  2015-10-05 17:23 Optimization of string.c Sebastian Gniazdowski
@ 2015-10-05 17:27 ` Sebastian Gniazdowski
  2015-10-05 17:53 ` Sebastian Gniazdowski
  2015-10-05 21:18 ` Bart Schaefer
  2 siblings, 0 replies; 4+ messages in thread
From: Sebastian Gniazdowski @ 2015-10-05 17:27 UTC (permalink / raw)
  To: zsh-workers

On 5 October 2015 at 19:23, Sebastian Gniazdowski
> - OS X 10.9.2, clang-500.2.79, zsh version used in the video 377e240,
> Core i5 2.3 GHz
> - FreeBSD 10.1, gcc 4.8.4, zsh version used in tests 64061e5, Pentium M 600 MHz


The FreeBSD version is 10.2

Best regards,
Sebastian Gniazdowski


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Optimization of string.c
  2015-10-05 17:23 Optimization of string.c Sebastian Gniazdowski
  2015-10-05 17:27 ` Sebastian Gniazdowski
@ 2015-10-05 17:53 ` Sebastian Gniazdowski
  2015-10-05 21:18 ` Bart Schaefer
  2 siblings, 0 replies; 4+ messages in thread
From: Sebastian Gniazdowski @ 2015-10-05 17:53 UTC (permalink / raw)
  To: zsh-workers

On 5 October 2015 at 19:23, Sebastian Gniazdowski
<sgniazdowski@gmail.com> wrote:
> machine. It is worth noting that the FreeBSD's memcpy isn't the
> fastest one. I would say it is slow
> (http://www.embedded.com/design/configurable-systems/4024961/Optimizing-Memcpy-improves-speed).

The correct link about FreeBSD's memcpy peformance is:
http://blog.dataparksearch.org/208

Best regards,
Sebastian Gniazdowski


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Optimization of string.c
  2015-10-05 17:23 Optimization of string.c Sebastian Gniazdowski
  2015-10-05 17:27 ` Sebastian Gniazdowski
  2015-10-05 17:53 ` Sebastian Gniazdowski
@ 2015-10-05 21:18 ` Bart Schaefer
  2 siblings, 0 replies; 4+ messages in thread
From: Bart Schaefer @ 2015-10-05 21:18 UTC (permalink / raw)
  To: Sebastian Gniazdowski, zsh-workers

On Oct 5,  7:23pm, Sebastian Gniazdowski wrote:
}
} It might be that the faster the CPU is, the lower the gain is. If
} someone could repeat the tests on machine with 1.5 GHz or more it
} would be of value.

I ran the tests twice each, once built without the patch (and with
--enable-zsh-debug) and the other using the patch (but not debug,
accidentally), on a VirtualBox VM inside a 3.2GHz host.  cpuinfo in
the VM guest reports the same values as the host, whether that's an
accurate measure or not.  I may try again later with more care to
be sure the compiler flags are the same.

Note in the testopt2 and testopt4 cases performance was worse with
the patch, which is why I decided to send this even though I messed
up the configure flags.  Best viewed in fixed-width.


No patch w/ debug                  |  Patch w/o debug
-----------------------------------|----------------------------------
				   |  				
testopt1                           |  testopt1			
				   |  				
num  calls                time     |  num  calls                time  
-----------------------------------|----------------------------------
 1)    5        2214.75   442.95   |   1)    5        1690.37   338.07
 1)    5        2191.67   438.33   |   1)    5        1697.04   339.41
				   |  				
-----------------------------------|----------------------------------
				   |  				
testopt2			   |  testopt2			
				   |  				
num  calls                time     |  num  calls                time  
-----------------------------------|----------------------------------
 1)    5         271.99    54.40   |   1)    5         311.95    62.39
 1)    5         274.84    54.97   |   1)    5         312.72    62.54
				   |  				
-----------------------------------|----------------------------------
				   |  				
testopt3			   |  testopt3			
				   |  				
num  calls                time     |  num  calls                time  
-----------------------------------|----------------------------------
 1)    1       13331.32 13331.32   |   1)    1       10533.46 10533.46
 1)    1       13468.76 13468.76   |   1)    1       10538.59 10538.59
				   |  				
-----------------------------------|----------------------------------
				   |  				
testopt4			   |  testopt4			
				   |  				
num  calls                time     |  num  calls                time  
-----------------------------------|----------------------------------
 1)    1        2085.65  2085.65   |   1)    1        2442.47  2442.47
 1)    1        2010.71  2010.71   |   1)    1        2523.01  2523.01
				   |  				
-----------------------------------|----------------------------------


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2015-10-05 21:19 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-05 17:23 Optimization of string.c Sebastian Gniazdowski
2015-10-05 17:27 ` Sebastian Gniazdowski
2015-10-05 17:53 ` Sebastian Gniazdowski
2015-10-05 21:18 ` Bart Schaefer

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).