--------------------------------------------------------------------------- PATCH FOR ABSOLUTE SHARING IN MARSHALLED DATA FOR OCAML --------------------------------------------------------------------------- LICENSE Copyright (C) 2006 Jane Street Holding, LLC Address: 111 Broadway, 21st Floor, 10006 New York, USA Author: Markus Mottl All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Neither the name of Jane Street Holding, LLC, nor the names of its contributors may be used to endorse or promote products derived from this patch without specific prior written permission. THIS PATCH IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS PATCH, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --------------------------------------------------------------------------- APPLYING THE PATCH The patch was generated against the CVS-release of OCaml as of 2006-06-28, but may also work with other releases. To apply it just check out OCaml from the CVS-server at INRIA, or download the source distribution, and change to the source directory. Apply the patch, e.g.: patch -p0 < {path to this patch}/absolute.patch Now build OCaml from source as you would otherwise do (see the INSTALL-file in the OCaml-distribution for more details). Finally, install OCaml with "make install", and you are done. --------------------------------------------------------------------------- USAGE There is a new flag in module Marshal: Absolute_sharing. When passed during marshalling data, shared values will be encoded using absolute rather than relative offsets. This may change the size of the resulting data, sometimes slightly increasing, other times slightly decreasing it. Generally, if there are only a few shared values, absolute sharing will produce better results than the default relative sharing. If there are many shared values and if they are in close proximity to their pointers, the default relative sharing will work better. The biggest difference is that absolute sharing may often lead to greatly improved compression results when applying compressors like e.g. gzip to the data, because patterns of absolute addresses to shared values are constant and can be exploited by the compressor. Heavily shared data as e.g. resulting from hashconsing may only require a small fraction of storage space when compressed as opposed to compressed data encoded using relative sharing. We have seen orders of magnitude difference on artificial data, and factors of three on large production data. Values marshalled with older OCaml-runtimes are fully supported. Values marshalled with the default settings can also be used by older applications. But note that older applications cannot read the data format using absolute sharing. New York, 2006-06-28 Markus Mottl Jane Street Holding, LLC --------------------------------------------------------------------------- Index: byterun/extern.c =================================================================== RCS file: /caml/ocaml/byterun/extern.c,v retrieving revision 1.58.2.2 diff -u -r1.58.2.2 extern.c --- byterun/extern.c 10 Jun 2006 09:02:40 -0000 1.58.2.2 +++ byterun/extern.c 28 Jun 2006 21:59:08 -0000 @@ -35,6 +35,7 @@ static int extern_ignore_sharing; /* Flag to ignore sharing */ static int extern_closures; /* Flag to allow externing code pointers */ +static int extern_absolute_sharing; /* Flag to turn on absolute sharing */ /* Trail mechanism to undo forwarding pointers put inside objects */ @@ -334,7 +335,10 @@ } /* Check if already seen */ if (Color_hd(hd) == Caml_blue) { - uintnat d = obj_counter - (uintnat) Field(v, 0); + uintnat d = + extern_absolute_sharing + ? (uintnat) Field(v, 0) + : obj_counter - (uintnat) Field(v, 0); if (d < 0x100) { writecode8(CODE_SHARED8, d); } else if (d < 0x10000) { @@ -449,8 +453,8 @@ } } -enum { NO_SHARING = 1, CLOSURES = 2 }; -static int extern_flags[] = { NO_SHARING, CLOSURES }; +enum { NO_SHARING = 1, CLOSURES = 2, ABSOLUTE_SHARING = 4 }; +static int extern_flags[] = { NO_SHARING, CLOSURES, ABSOLUTE_SHARING }; static intnat extern_value(value v, value flags) { @@ -460,13 +464,18 @@ fl = caml_convert_flag_list(flags, extern_flags); extern_ignore_sharing = fl & NO_SHARING; extern_closures = fl & CLOSURES; + extern_absolute_sharing = fl & ABSOLUTE_SHARING; /* Initializations */ init_extern_trail(); obj_counter = 0; size_32 = 0; size_64 = 0; /* Write magic number */ - write32(Intext_magic_number); + write32( + extern_absolute_sharing + ? Intext_magic_number_absolute_sharing + : Intext_magic_number + ); /* Set aside space for the sizes */ extern_ptr += 4*4; /* Marshal the object */ Index: byterun/intern.c =================================================================== RCS file: /caml/ocaml/byterun/intern.c,v retrieving revision 1.60 diff -u -r1.60 intern.c --- byterun/intern.c 22 Sep 2005 14:21:50 -0000 1.60 +++ byterun/intern.c 28 Jun 2006 21:59:08 -0000 @@ -40,6 +40,9 @@ /* 1 if intern_input was allocated by caml_stat_alloc() and needs caml_stat_free() on error, 0 otherwise. */ +static int intern_absolute_sharing; +/* 1 if absolute sharing is used, 0 otherwise. */ + static header_t * intern_dest; /* Writing pointer in destination block */ @@ -187,7 +190,10 @@ Assert (ofs > 0); Assert (ofs <= obj_counter); Assert (intern_obj_table != NULL); - v = intern_obj_table[obj_counter - ofs]; + v = + intern_absolute_sharing + ? intern_obj_table[ofs] + : intern_obj_table[obj_counter - ofs]; break; case CODE_SHARED16: ofs = read16u(); @@ -387,6 +393,14 @@ } } +static void handle_magic(uint32 magic, char *msg) +{ + if (magic == Intext_magic_number) intern_absolute_sharing = 0; + else if (magic == Intext_magic_number_absolute_sharing) + intern_absolute_sharing = 1; + else caml_failwith(msg); +} + value caml_input_val(struct channel *chan) { uint32 magic; @@ -397,7 +411,7 @@ if (! caml_channel_binary_mode(chan)) caml_failwith("input_value: not a binary channel"); magic = caml_getword(chan); - if (magic != Intext_magic_number) caml_failwith("input_value: bad object"); + handle_magic(magic, "input_value: bad object"); block_len = caml_getword(chan); num_objects = caml_getword(chan); size_32 = caml_getword(chan); @@ -508,8 +522,7 @@ intern_src = intern_input + ofs; intern_input_malloced = 1; magic = read32u(); - if (magic != Intext_magic_number) - caml_failwith("input_value_from_malloc: bad object"); + handle_magic(magic, "input_value_from_malloc: bad object"); block_len = read32u(); obj = input_val_from_block(); /* Free the input */ @@ -527,8 +540,7 @@ intern_src = intern_input; intern_input_malloced = 0; magic = read32u(); - if (magic != Intext_magic_number) - caml_failwith("input_value_from_block: bad object"); + handle_magic(magic, "input_value_from_block: bad object"); block_len = read32u(); if (5*4 + block_len > len) caml_failwith("input_value_from_block: bad block length"); @@ -544,9 +556,7 @@ intern_src = &Byte_u(buff, Long_val(ofs)); intern_input_malloced = 0; magic = read32u(); - if (magic != Intext_magic_number){ - caml_failwith("Marshal.data_size: bad object"); - } + handle_magic(magic, "Marshal.data_size: bad object"); block_len = read32u(); return Val_long(block_len); } Index: byterun/intext.h =================================================================== RCS file: /caml/ocaml/byterun/intext.h,v retrieving revision 1.32 diff -u -r1.32 intext.h --- byterun/intext.h 22 Sep 2005 14:21:50 -0000 1.32 +++ byterun/intext.h 28 Jun 2006 21:59:08 -0000 @@ -30,6 +30,7 @@ /* Magic number */ #define Intext_magic_number 0x8495A6BE +#define Intext_magic_number_absolute_sharing 0xEB6A5948 /* Codes for the compact format */ Index: otherlibs/threads/marshal.ml =================================================================== RCS file: /caml/ocaml/otherlibs/threads/marshal.ml,v retrieving revision 1.9 diff -u -r1.9 marshal.ml --- otherlibs/threads/marshal.ml 27 May 2004 15:28:05 -0000 1.9 +++ otherlibs/threads/marshal.ml 28 Jun 2006 21:59:08 -0000 @@ -16,6 +16,7 @@ type extern_flags = No_sharing | Closures + | Absolute_sharing external to_string: 'a -> extern_flags list -> string = "caml_output_value_to_string" Index: stdlib/marshal.ml =================================================================== RCS file: /caml/ocaml/stdlib/marshal.ml,v retrieving revision 1.9 diff -u -r1.9 marshal.ml --- stdlib/marshal.ml 25 Oct 2005 18:34:07 -0000 1.9 +++ stdlib/marshal.ml 28 Jun 2006 21:59:08 -0000 @@ -16,6 +16,7 @@ type extern_flags = No_sharing | Closures + | Absolute_sharing external to_channel: out_channel -> 'a -> extern_flags list -> unit = "caml_output_value" Index: stdlib/marshal.mli =================================================================== RCS file: /caml/ocaml/stdlib/marshal.mli,v retrieving revision 1.14 diff -u -r1.14 marshal.mli --- stdlib/marshal.mli 25 Oct 2005 18:34:07 -0000 1.14 +++ stdlib/marshal.mli 28 Jun 2006 21:59:08 -0000 @@ -47,6 +47,7 @@ type extern_flags = No_sharing (** Don't preserve sharing *) | Closures (** Send function closures *) + | Absolute_sharing (** Use absolute addresses for sharing *) (** The flags to the [Marshal.to_*] functions below. *) val to_channel : out_channel -> 'a -> extern_flags list -> unit