diff -ru /home/edwin/ocaml-3.12.0+rc1/byterun//major_gc.c ./major_gc.c
--- /home/edwin/ocaml-3.12.0+rc1/byterun//major_gc.c 2009-11-04 14:25:47.000000000 +0200
+++ ./major_gc.c 2010-12-02 17:02:38.000000000 +0200
@@ -286,21 +286,25 @@
 {
   char *hp;
   header_t hd;
+  /* Speed: sweep through a local copy of caml_gc_sweep_hp; the global is refreshed at the top of the Caml_white case (before the finalizer and caml_fl_merge_block calls) and once more after the loop. */
+  char *gc_sweep_hp = caml_gc_sweep_hp;
 
   caml_gc_message (0x40, "Sweeping %ld words\n", work);
   while (work > 0){
-    if (caml_gc_sweep_hp < limit){
-      hp = caml_gc_sweep_hp;
+    if (gc_sweep_hp < limit){
+      hp = gc_sweep_hp;
       hd = Hd_hp (hp);
       work -= Whsize_hd (hd);
-      caml_gc_sweep_hp += Bhsize_hd (hd);
+      gc_sweep_hp += Bhsize_hd (hd);
+      PREFETCH_READ_NT(gc_sweep_hp);
       switch (Color_hd (hd)){
       case Caml_white:
+        caml_gc_sweep_hp = gc_sweep_hp;
         if (Tag_hd (hd) == Custom_tag){
           void (*final_fun)(value) = Custom_ops_val(Val_hp(hp))->finalize;
           if (final_fun != NULL) final_fun(Val_hp(hp));
         }
-        caml_gc_sweep_hp = caml_fl_merge_block (Bp_hp (hp));
+        gc_sweep_hp = caml_fl_merge_block (Bp_hp (hp));
         break;
       case Caml_blue:
         /* Only the blocks of the free-list are blue. See [freelist.c]. */
@@ -311,7 +315,7 @@
         Hd_hp (hp) = Whitehd_hd (hd);
         break;
       }
-      Assert (caml_gc_sweep_hp <= limit);
+      Assert (gc_sweep_hp <= limit);
     }else{
       chunk = Chunk_next (chunk);
       if (chunk == NULL){
@@ -320,11 +324,12 @@
         work = 0;
         caml_gc_phase = Phase_idle;
       }else{
-        caml_gc_sweep_hp = chunk;
+        gc_sweep_hp = chunk;
         limit = chunk + Chunk_size (chunk);
       }
     }
   }
+  caml_gc_sweep_hp = gc_sweep_hp;
 }
 
 
diff -ru /home/edwin/ocaml-3.12.0+rc1/byterun//memory.h ./memory.h
--- /home/edwin/ocaml-3.12.0+rc1/byterun//memory.h 2008-12-03 20:09:09.000000000 +0200
+++ ./memory.h 2010-12-02 17:06:12.000000000 +0200
@@ -215,6 +215,16 @@
   #define CAMLunused
 #endif
 
+/* Read-prefetch hints. PREFETCH_READ_NT is the non-temporal form: it passes 0 as the third __builtin_prefetch argument (the data need not be left in the cache after the access); PREFETCH_READ passes 3. Both expand to nothing unless the compiler is GCC newer than 3.1. */
+#if defined (__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 1))
+  #define PREFETCH_READ_NT(addr) __builtin_prefetch((addr), 0, 0)
+  #define PREFETCH_READ(addr) __builtin_prefetch((addr), 0, 3)
+#else
+  #define PREFETCH_READ_NT(addr)
+  #define PREFETCH_READ(addr)
+#endif
+
+
 #define CAMLxparam1(x) \
   struct caml__roots_block caml__roots_##x; \
   CAMLunused int caml__dummy_##x = ( \