caml-list - the Caml user's mailing list
 help / color / mirror / Atom feed
* Re: [Caml-list] Writing the function Set.map using first-class modules and 4.00 inference
       [not found]       ` <fa.YAoOn2iUcpHbr53eeBfGhPTDvtM@ifi.uio.no>
@ 2012-11-05  8:05         ` Radu Grigore
  2012-11-05 10:12           ` Gabriel Scherer
  0 siblings, 1 reply; 8+ messages in thread
From: Radu Grigore @ 2012-11-05  8:05 UTC (permalink / raw)
  To: fa.caml; +Cc: Gabriel Scherer, Caml List

Just to clarify what I meant, see the attached (hackish) benchmarks and patch.

-- begin benchmarks.ml --
(* Times (in seconds)

1000000 random lists of length 10
  of_list1 9.23
  of_list2 9.80
  S.of_list 10.15 (+3.6% compared to of_list2)

1 random list of length 10000000
  of_list1 Stack_overflow
  of_list2 102.39
  S.of_list 104.15 (+1.8% compared to of_list2)

1000000 sorted lists of length 10
  of_list1 11.14
  of_list2 12.51
  S.of_list 4.53 (-63% compared to of_list2)

1 sorted list of length 10000000
  of_list1 Stack_overflow
  of_list2 71.91
  S.of_list 6.73 (-90% compared to of_list2)

*)
open Printf

module S = Set.Make (struct type t = int let compare = compare end)

let of_list1 xs = List.fold_right S.add xs S.empty
let of_list2 xs = List.fold_left (fun x y -> S.add y x ) S.empty xs

let mk_list n =
  let xs = ref [] in
  for i = n downto 1 do xs := i (*Random.int max_int*) :: !xs done;
  !xs

let time s f xss =
  let a = Sys.time () in
  List.iter (fun xs -> ignore (f xs)) xss;
  let b = Sys.time () in
  printf "%s %.2f\n" s (b -. a)

let _ =
  Random.init 0;
  let n = 1 in
  let xss = ref [] in
  for i = 1 to n do xss := mk_list 10000000 :: !xss done;
(*   time "of_list1" of_list1 !xss; *)
  time "of_list2" of_list2 !xss;
  time "S.of_list" S.of_list !xss
-- end benchmarks.ml --



--begin of_list.patch--
Index: stdlib/moreLabels.mli
===================================================================
--- stdlib/moreLabels.mli	(revision 13061)
+++ stdlib/moreLabels.mli	(working copy)
@@ -155,6 +155,7 @@
       val partition : f:(elt -> bool) -> t -> t * t
       val cardinal : t -> int
       val elements : t -> elt list
+      val of_list : elt list -> t
       val min_elt : t -> elt
       val max_elt : t -> elt
       val choose : t -> elt
Index: stdlib/set.ml
===================================================================
--- stdlib/set.ml	(revision 13061)
+++ stdlib/set.ml	(working copy)
@@ -43,6 +43,7 @@
     val partition: (elt -> bool) -> t -> t * t
     val cardinal: t -> int
     val elements: t -> elt list
+    val of_list: elt list -> t
     val min_elt: t -> elt
     val max_elt: t -> elt
     val choose: t -> elt
@@ -343,6 +344,29 @@
         Empty -> accu
       | Node(l, v, r, _) -> elements_aux (v :: elements_aux accu r) l
 
+    exception Unsorted
+
+    let rec of_sorted_list n xs =
+      if n = 0 then (empty, xs) else begin
+        let l, xs = of_sorted_list (n / 2) xs in
+        let v, xs = (match xs with v :: xs -> v, xs | [] -> assert false) in
+        let r, xs = of_sorted_list (n - n / 2 - 1) xs in
+        (create l v r, xs)
+      end
+
+    let of_list = function
+        [] -> empty
+      | (x :: xs) as ys -> begin
+          let rec len n x = function
+              [] -> n
+            | z :: zs ->
+                if Ord.compare x z < 0
+                then len (n + 1) z zs
+                else raise Unsorted in
+          try let n = len 1 x xs in let set, _ = of_sorted_list n ys in set
+          with Unsorted -> List.fold_left (fun t elt -> add elt t) empty ys
+        end
+
     let elements s =
       elements_aux [] s
 
Index: stdlib/set.mli
===================================================================
--- stdlib/set.mli	(revision 13061)
+++ stdlib/set.mli	(working copy)
@@ -121,6 +121,9 @@
        to the ordering [Ord.compare], where [Ord] is the argument
        given to {!Set.Make}. *)
 
+    val of_list : elt list -> t
+    (** Makes a set out of a list. *)
+
     val min_elt: t -> elt
     (** Return the smallest element of the given set
        (with respect to the [Ord.compare] ordering), or raise
--end of_list.patch--

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [Caml-list] Writing the function Set.map using first-class modules and 4.00 inference
  2012-11-05  8:05         ` [Caml-list] Writing the function Set.map using first-class modules and 4.00 inference Radu Grigore
@ 2012-11-05 10:12           ` Gabriel Scherer
  2012-11-05 12:02             ` Radu Grigore
  0 siblings, 1 reply; 8+ messages in thread
From: Gabriel Scherer @ 2012-11-05 10:12 UTC (permalink / raw)
  To: Radu Grigore; +Cc: fa.caml, Caml List

You haven't tested the bad case, where the list is nearly sorted but
not exactly -- random lists are a good-case behavior as unsortedness
is detected very early. I don't expect the performances to be that bad
("len" avoids any allocation). Another problem is that you call the
user-defined comparison more than necessary, which can be problematic
if it is costly -- you can solve that by rather using of_sorted_list
to build the partial result list before raising Unsolved rather than
rebuilding it from the ground up.

I have played with such dynamic algorithm selection in the Map
implementation of Batteries¹, but I'm not really satisfied with it:
you end up making guesses about the user situation which are hard to
evaluate (we have no real-world-looking data to measure against) and
pay costs that a demanding user could request to skip by having
of_sorted_list and of_list exposed directly, and making the choice
herself.

¹: the use case was similar: there is an implementation of
non-functorized maps inherited from Extlib that some user find
convenient to use, where maps are records carrying their own
comparison operation, but then the implementation of binary functions
such as union/diff/intersect and merge are very different depending on
whether the two comparison operations provided are compatible or not.
  https://github.com/ocaml-batteries-team/batteries-included/blob/master/src/batMap.ml#L571

A note on API design: the type abstraction of Set and Map are
inconvenient when users want to implement better performing operations
on top of the balanced tree model. My personal opinion is that we
should preserve the abstraction of Set and Map, reject all operations
that rely on a representation assumption of sorted balanced trees
(unfortunately some of them, such as split, have already crept in),
and provide as a different module a "purely algorithmic" transparent
definition of balanced binary trees that users could extend easily and
on which they would build their own data structures. I insist on
having both abstract purpose-oriented modules and concrete
algorithm-oriented data structures, and not mixing them as is
currently done.

Not relying on ordered representations would be useful if we
considered moving to a different representation for sets and maps.
Thibault Suzanne has been experimenting with Hash Array Mapped Tries
(HAMT) in OCaml and the results are relatively promising, but this
implementation would have a hard time transparently replacing stdlib's
Set and Map because of their exposed balanced implementation details.
  https://gitorious.org/ocaml-hamt


On Mon, Nov 5, 2012 at 9:05 AM, Radu Grigore <radugrigore@gmail.com> wrote:
> Just to clarify what I meant, see the attached (hackish) benchmarks and patch.
>
> -- begin benchmarks.ml --
> (* Times (in seconds)
>
> 1000000 random lists of length 10
>   of_list1 9.23
>   of_list2 9.80
>   S.of_list 10.15 (+3.6% compared to of_list2)
>
> 1 random list of length 10000000
>   of_list1 Stack_overflow
>   of_list2 102.39
>   S.of_list 104.15 (+1.8% compared to of_list2)
>
> 1000000 sorted lists of length 10
>   of_list1 11.14
>   of_list2 12.51
>   S.of_list 4.53 (-63% compared to of_list2)
>
> 1 sorted list of length 10000000
>   of_list1 Stack_overflow
>   of_list2 71.91
>   S.of_list 6.73 (-90% compared to of_list2)
>
> *)
> open Printf
>
> module S = Set.Make (struct type t = int let compare = compare end)
>
> let of_list1 xs = List.fold_right S.add xs S.empty
> let of_list2 xs = List.fold_left (fun x y -> S.add y x ) S.empty xs
>
> let mk_list n =
>   let xs = ref [] in
>   for i = n downto 1 do xs := i (*Random.int max_int*) :: !xs done;
>   !xs
>
> let time s f xss =
>   let a = Sys.time () in
>   List.iter (fun xs -> ignore (f xs)) xss;
>   let b = Sys.time () in
>   printf "%s %.2f\n" s (b -. a)
>
> let _ =
>   Random.init 0;
>   let n = 1 in
>   let xss = ref [] in
>   for i = 1 to n do xss := mk_list 10000000 :: !xss done;
> (*   time "of_list1" of_list1 !xss; *)
>   time "of_list2" of_list2 !xss;
>   time "S.of_list" S.of_list !xss
> -- end benchmarks.ml --
>
>
>
> --begin of_list.patch--
> Index: stdlib/moreLabels.mli
> ===================================================================
> --- stdlib/moreLabels.mli       (revision 13061)
> +++ stdlib/moreLabels.mli       (working copy)
> @@ -155,6 +155,7 @@
>        val partition : f:(elt -> bool) -> t -> t * t
>        val cardinal : t -> int
>        val elements : t -> elt list
> +      val of_list : elt list -> t
>        val min_elt : t -> elt
>        val max_elt : t -> elt
>        val choose : t -> elt
> Index: stdlib/set.ml
> ===================================================================
> --- stdlib/set.ml       (revision 13061)
> +++ stdlib/set.ml       (working copy)
> @@ -43,6 +43,7 @@
>      val partition: (elt -> bool) -> t -> t * t
>      val cardinal: t -> int
>      val elements: t -> elt list
> +    val of_list: elt list -> t
>      val min_elt: t -> elt
>      val max_elt: t -> elt
>      val choose: t -> elt
> @@ -343,6 +344,29 @@
>          Empty -> accu
>        | Node(l, v, r, _) -> elements_aux (v :: elements_aux accu r) l
>
> +    exception Unsorted
> +
> +    let rec of_sorted_list n xs =
> +      if n = 0 then (empty, xs) else begin
> +        let l, xs = of_sorted_list (n / 2) xs in
> +        let v, xs = (match xs with v :: xs -> v, xs | [] -> assert false) in
> +        let r, xs = of_sorted_list (n - n / 2 - 1) xs in
> +        (create l v r, xs)
> +      end
> +
> +    let of_list = function
> +        [] -> empty
> +      | (x :: xs) as ys -> begin
> +          let rec len n x = function
> +              [] -> n
> +            | z :: zs ->
> +                if Ord.compare x z < 0
> +                then len (n + 1) z zs
> +                else raise Unsorted in
> +          try let n = len 1 x xs in let set, _ = of_sorted_list n ys in set
> +          with Unsorted -> List.fold_left (fun t elt -> add elt t) empty ys
> +        end
> +
>      let elements s =
>        elements_aux [] s
>
> Index: stdlib/set.mli
> ===================================================================
> --- stdlib/set.mli      (revision 13061)
> +++ stdlib/set.mli      (working copy)
> @@ -121,6 +121,9 @@
>         to the ordering [Ord.compare], where [Ord] is the argument
>         given to {!Set.Make}. *)
>
> +    val of_list : elt list -> t
> +    (** Makes a set out of a list. *)
> +
>      val min_elt: t -> elt
>      (** Return the smallest element of the given set
>         (with respect to the [Ord.compare] ordering), or raise
> --end of_list.patch--

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [Caml-list] Writing the function Set.map using first-class modules and 4.00 inference
  2012-11-05 10:12           ` Gabriel Scherer
@ 2012-11-05 12:02             ` Radu Grigore
  0 siblings, 0 replies; 8+ messages in thread
From: Radu Grigore @ 2012-11-05 12:02 UTC (permalink / raw)
  To: Gabriel Scherer; +Cc: fa.caml, Caml List

On Mon, Nov 5, 2012 at 10:12 AM, Gabriel Scherer
<gabriel.scherer@gmail.com> wrote:
> You haven't tested the bad case, where the list is nearly sorted but
> not exactly

It's 3.7% slower, rather than 3.6% for random lists. (But, you
shouldn't be calling it "*the* bad case".)

> Another problem is that you call the
> user-defined comparison more than necessary, [...]

I would note only that the "necessary" number of comparisons is not
really achieved with your optimization, for the simple reason that the
minimum number of comparisons is hard to compute. [1]

> pay costs that a demanding user could request to skip by having
> of_sorted_list and of_list exposed directly, and making the choice
> herself.

I'm not sure what costs you are talking about:
(1) If it's the case of sorted lists, then that user is already paying
a much bigger price now.
(2) It it's the case of non-sorted lists, then the demanding user can
always do what they do now.

> A note on API design: [...] My personal opinion is that we
> should preserve the abstraction of Set and Map, reject all operations
> that rely on a representation assumption of sorted balanced trees
> (unfortunately some of them, such as split, have already crept in),

The (of_list : elt list -> t) function relies on no assumption about
how the set is implemented. With *any* reasonable implementation of a
set, you should be able to build it from a list. The only reason to
*not* put it in the interface would be if the user could write an
implementation that is just as good. But they can't.

[1] http://oeis.org/A036604

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [Caml-list] Writing the function Set.map using first-class modules and 4.00 inference
  2012-11-02 16:00     ` Gabriel Scherer
@ 2012-11-02 16:21       ` Radu Grigore
  0 siblings, 0 replies; 8+ messages in thread
From: Radu Grigore @ 2012-11-02 16:21 UTC (permalink / raw)
  To: Gabriel Scherer; +Cc: Caml List

On Fri, Nov 2, 2012 at 4:00 PM, Gabriel Scherer
<gabriel.scherer@gmail.com> wrote:
> But when you map from an ASet to a BSet, there is no reason to assume
> in general that the mapping function preserves the order,

Oops.

> I don't think you can do without the O(n log n) worst case bound.

Probably not.

> I'm not sure order-preserving mappings happen that often in practice.

I can't remember a single instance where I needed to map between sets,
order-preserving or not.

I do, however, often make sets out of lists. In a few cases I know
that the list is sorted by construction. I would like to have
[of_sorted_list]. Or (better?) an [of_list] that works in O(n) for
sorted lists and falls back to O(n lg n) when the list isn't already
sorted. This can't be done without messing with Set's internals, which
is why I think it should be in the interface.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [Caml-list] Writing the function Set.map using first-class modules and 4.00 inference
  2012-11-02 15:11   ` Radu Grigore
@ 2012-11-02 16:00     ` Gabriel Scherer
  2012-11-02 16:21       ` Radu Grigore
  0 siblings, 1 reply; 8+ messages in thread
From: Gabriel Scherer @ 2012-11-02 16:00 UTC (permalink / raw)
  To: Radu Grigore; +Cc: Caml List

But when you map from an ASet to a BSet, there is no reason to assume
in general that the mapping function preserves the order, so I don't
think you can do without the O(n log n) worst case bound. You could
provide a specialized function that assumes that the mapping is
order-preserving (you could even check it at runtime with no
additional complexity cost, for added safety), but I'm not sure
order-preserving mappings happen that often in practice.

On Fri, Nov 2, 2012 at 4:11 PM, Radu Grigore <radugrigore@gmail.com> wrote:
> On Friday, November 2, 2012 2:59:17 PM UTC, Radu Grigore wrote:
>> Somebody should put the linear time [of_list] in the standard library.
>
> Actually, that should probably be called [of_sorted_list], to make it clear what it does.  If you worry about having a function that requires a sorted list, then note that there is already one in the standard library: List.merge.
>
> --
> Caml-list mailing list.  Subscription management and archives:
> https://sympa.inria.fr/sympa/arc/caml-list
> Beginner's list: http://groups.yahoo.com/group/ocaml_beginners
> Bug reports: http://caml.inria.fr/bin/caml-bugs

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [Caml-list] Writing the function Set.map using first-class modules and 4.00 inference
  2012-11-02 14:59 ` Radu Grigore
@ 2012-11-02 15:11   ` Radu Grigore
  2012-11-02 16:00     ` Gabriel Scherer
  0 siblings, 1 reply; 8+ messages in thread
From: Radu Grigore @ 2012-11-02 15:11 UTC (permalink / raw)
  To: fa.caml; +Cc: Caml List

On Friday, November 2, 2012 2:59:17 PM UTC, Radu Grigore wrote:
> Somebody should put the linear time [of_list] in the standard library.

Actually, that should probably be called [of_sorted_list], to make it clear what it does.  If you worry about having a function that requires a sorted list, then note that there is already one in the standard library: List.merge.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [Caml-list] Writing the function Set.map using first-class modules and 4.00 inference
       [not found] <fa.FUGAe9RTYsSPA4RwbpKO14B0oVo@ifi.uio.no>
@ 2012-11-02 14:59 ` Radu Grigore
  2012-11-02 15:11   ` Radu Grigore
  0 siblings, 1 reply; 8+ messages in thread
From: Radu Grigore @ 2012-11-02 14:59 UTC (permalink / raw)
  To: fa.caml; +Cc: Caml List

Nice.  I'll show you an alternative.

In your solution you write
  let chrs = CharSet.map (module StringSet) strs (fun s -> s.[0])
In my solution you write
  let chrs = CharSet.of_list (StringSet.map_l strs (fun s -> s.[0]))
It has about the same length.

The disadvantage of my solution is that the type of map_l is weird
  StringSet.map_l : StringSet.t -> (string -> 'a) -> 'a list

The advantages are
1. It is possible to implement in O(n), rather than O(n lg n)
2. Works in 3.12.
Somebody should put the linear time [of_list] in the standard library.

Here is a simple, O(n lg n) implementation, similar to yours.

  module FunkySet = struct
    module type OrderedType = Set.OrderedType
    module type S = Set.S
    module Make (E : OrderedType) = struct
      include Set.Make (E)
      let of_list xs = List.fold_right add xs empty
      let map_l s f = List.map f (elements s)
    end
  end

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Caml-list] Writing the function Set.map using first-class modules and 4.00 inference
@ 2012-11-01 21:00 Jeff Meister
  0 siblings, 0 replies; 8+ messages in thread
From: Jeff Meister @ 2012-11-01 21:00 UTC (permalink / raw)
  To: Caml List

[-- Attachment #1: Type: text/plain, Size: 2389 bytes --]

I found an interesting (to me, anyway) use of OCaml's first-class modules,
and particularly the new 4.00 type inference features, which I thought was
worth sharing with the list. This has probably been observed by someone
else already, but I haven't seen it discussed.

In the OCaml standard library, the polymorphic set data structure is
implemented as a functor, which takes a module containing a type t and
total ordering function over t, and returns a module representing sets
whose elements have type t. Like so:

module StringSet = Set.Make(String)
module CharSet = Set.Make(Char)

One disadvantage of this method is that once the functor has been called,
the type of the set elements is fixed. As a consequence, OCaml's set
interface has no map function. If we had a polymorphic type like 'a set,
this function would have type 'a set -> ('a -> 'b) -> 'b set. But
StringSet.t and CharSet.t are not polymorphic; the corresponding type elt
in each module cannot be changed.

However, using first-class modules, we can write a function map for sets,
which takes as an extra argument the packaged module representing the set
we're mapping from. Maybe this function is better called map_from. Check it
out:

# module Set = struct
    module type OrderedType = Set.OrderedType
    module type S = Set.S
    module Make(Ord : OrderedType) = struct
      include Set.Make(Ord)
      let map (type e') (type t') (module OtherSet : S with type elt = e'
and type t = t') os f =
       OtherSet.fold (fun x accu -> add (f x) accu) os empty
   end
 end;;
[... bunch of output ...]
val map :
  (module S with type elt = 'a and type t = 'b) ->
  'b -> ('a -> elt) -> t

Now, back in OCaml 3.12, this function could be written (without the nice
package-expanding pattern I've made use of), but calling it was quite a
pain, enough to invalidate the whole enterprise. One would have to type
this:

# let strs = StringSet.(add "foo" (add "bar" empty));;
val strs : StringSet.t = <abstr>
# let chrs = CharSet.map (module StringSet : Set.S with type elt =
StringSet.elt and type t = StringSet.t) strs (fun s -> s.[0]);;
val chrs : CharSet.t = <abstr>

It's much easier with the type inference changes in OCaml 4.00:

# let strs = StringSet.(add "foo" (add "bar" empty));;
val strs : StringSet.t = <abstr>
# let chrs = CharSet.map (module StringSet) strs (fun s -> s.[0]);;
val chrs : CharSet.t = <abstr>

[-- Attachment #2: Type: text/html, Size: 2693 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2012-11-05 12:02 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <fa.yP8czrxqEGCABee05wzAlISIlN4@ifi.uio.no>
     [not found] ` <fa.rve4x46ugIvZs4BizovvjOCeTG0@ifi.uio.no>
     [not found]   ` <fa.i9E+7rAvghoTZ1MXH4g+uL4dpCY@ifi.uio.no>
     [not found]     ` <fa.AX0JYJa8kXsH/YSZ08tuyBc8m+0@ifi.uio.no>
     [not found]       ` <fa.YAoOn2iUcpHbr53eeBfGhPTDvtM@ifi.uio.no>
2012-11-05  8:05         ` [Caml-list] Writing the function Set.map using first-class modules and 4.00 inference Radu Grigore
2012-11-05 10:12           ` Gabriel Scherer
2012-11-05 12:02             ` Radu Grigore
     [not found] <fa.FUGAe9RTYsSPA4RwbpKO14B0oVo@ifi.uio.no>
2012-11-02 14:59 ` Radu Grigore
2012-11-02 15:11   ` Radu Grigore
2012-11-02 16:00     ` Gabriel Scherer
2012-11-02 16:21       ` Radu Grigore
2012-11-01 21:00 Jeff Meister

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).