[Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat

caml-list - the Caml user's mailing list
 help / color / mirror / Atom feed

* [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
@ 2011-08-01 14:53 Benedikt Meurer
  2011-08-01 15:03 ` [Caml-list] " Benedikt Meurer
                   ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Benedikt Meurer @ 2011-08-01 14:53 UTC (permalink / raw)
  To: caml-list; +Cc: Marcell Fischbach

[-- Attachment #1: Type: text/plain, Size: 1226 bytes --]

Hello,

As mentioned earlier we have a student working on an implementation of the Linear Scan Register Allocator [1] for ocamlopt (and thereby ocamlnat). It took some time, but now there's a first working patch which looks promising. This work is done by Marcell Fischbach as part of his diploma thesis. The idea is to use the linear scan algorithm to drive the register allocation in the native top-level ocamlnat at some point, as suggested by Fabrice Le Fessant [2].

Marcell is now working to implement a proof-of-concept of an inline assembler for ocamlnat on i386 based on code from Alain Frisch an Fabrice Le Fessant. The result will also be contributed once ready, and will be used to effectively compare ocamlnat and the byte-code ocaml top-level.

The linear scan implementation reuses as much of the existing ocamlopt functionality as possible, so additional maintenance overhead should be manageable. Comments and suggestions are welcome of course. Please keep Marcell CC'ed with any replies as he's not subscribed to the list.

greets,
Benedikt


[1] http://portal.acm.org/citation.cfm?id=330250
[2] http://caml.inria.fr/pub/ml-archives/caml-list/2010/11/a1b0ed0934ff51df4ac07c5e9da6e9d6.en.html


[-- Attachment #2: ocaml-linear-scan-20110801.diff --]
[-- Type: application/octet-stream, Size: 32036 bytes --]

Index: driver/main_args.mli
===================================================================
--- driver/main_args.mli	(Revision 2)
+++ driver/main_args.mli	(Revision 27)
@@ -142,6 +142,8 @@
   val _warn_help : unit -> unit
   val _where : unit -> unit
 
+  val _linscan : unit -> unit
+
   val _nopervasives : unit -> unit
   val _dparsetree : unit -> unit
   val _drawlambda : unit -> unit
@@ -158,6 +160,7 @@
   val _dreload : unit -> unit
   val _dscheduling :  unit -> unit
   val _dlinear :  unit -> unit
+  val _dinterval :  unit -> unit
   val _dstartup :  unit -> unit
 
   val anonymous : string -> unit
@@ -185,6 +188,8 @@
   val _warn_error : string -> unit
   val _warn_help : unit -> unit
 
+  val _linscan : unit -> unit
+
   val _dparsetree : unit -> unit
   val _drawlambda : unit -> unit
   val _dlambda : unit -> unit
@@ -200,6 +205,7 @@
   val _dreload : unit -> unit
   val _dscheduling :  unit -> unit
   val _dlinear :  unit -> unit
+  val _dinterval :  unit -> unit
   val _dstartup :  unit -> unit
 
   val anonymous : string -> unit
Index: driver/main_args.ml
===================================================================
--- driver/main_args.ml	(Revision 2)
+++ driver/main_args.ml	(Revision 27)
@@ -298,6 +298,11 @@
   "-use-prims", Arg.String f, "<file>  (undocumented)"
 ;;
 
+let mk_linscan f =
+  "-linscan", Arg.Unit f, " (undocumented)"
+;;
+
+
 let mk_dparsetree f =
   "-dparsetree", Arg.Unit f, " (undocumented)"
 ;;
@@ -362,6 +367,11 @@
   "-dlinear", Arg.Unit f, " (undocumented)"
 ;;
 
+let mk_dinterval f =
+  "-dinterval", Arg.Unit f, " (undocumented)"
+;;
+
+
 let mk_dstartup f =
   "-dstartup", Arg.Unit f, " (undocumented)"
 ;;
@@ -499,6 +509,8 @@
   val _warn_help : unit -> unit
   val _where : unit -> unit
 
+  val _linscan : unit -> unit
+
   val _nopervasives : unit -> unit
   val _dparsetree : unit -> unit
   val _drawlambda : unit -> unit
@@ -515,6 +527,7 @@
   val _dreload : unit -> unit
   val _dscheduling :  unit -> unit
   val _dlinear :  unit -> unit
+  val _dinterval :  unit -> unit
   val _dstartup :  unit -> unit
 
   val anonymous : string -> unit
@@ -542,6 +555,8 @@
   val _warn_error : string -> unit
   val _warn_help : unit -> unit
 
+  val _linscan : unit -> unit
+
   val _dparsetree : unit -> unit
   val _drawlambda : unit -> unit
   val _dlambda : unit -> unit
@@ -557,6 +572,7 @@
   val _dreload : unit -> unit
   val _dscheduling :  unit -> unit
   val _dlinear :  unit -> unit
+  val _dinterval :  unit -> unit
   val _dstartup :  unit -> unit
 
   val anonymous : string -> unit
@@ -709,6 +725,8 @@
     mk_warn_help F._warn_help;
     mk_where F._where;
 
+    mk_linscan F._linscan;
+
     mk_nopervasives F._nopervasives;
     mk_dparsetree F._dparsetree;
     mk_drawlambda F._drawlambda;
@@ -718,12 +736,14 @@
     mk_dcombine F._dcombine;
     mk_dlive F._dlive;
     mk_dspill F._dspill;
+    mk_dsplit F._dspill;
     mk_dinterf F._dinterf;
     mk_dprefer F._dprefer;
     mk_dalloc F._dalloc;
     mk_dreload F._dreload;
     mk_dscheduling F._dscheduling;
     mk_dlinear F._dlinear;
+    mk_dinterval F._dinterval;
     mk_dstartup F._dstartup;
 
     mk__ F.anonymous;
@@ -753,6 +773,8 @@
     mk_warn_error F._warn_error;
     mk_warn_help F._warn_help;
 
+    mk_linscan F._linscan;
+
     mk_dparsetree F._dparsetree;
     mk_drawlambda F._drawlambda;
     mk_dcmm F._dcmm;
@@ -760,12 +782,14 @@
     mk_dcombine F._dcombine;
     mk_dlive F._dlive;
     mk_dspill F._dspill;
+    mk_dsplit F._dspill;
     mk_dinterf F._dinterf;
     mk_dprefer F._dprefer;
     mk_dalloc F._dalloc;
     mk_dreload F._dreload;
     mk_dscheduling F._dscheduling;
     mk_dlinear F._dlinear;
+    mk_dinterval F._dinterval;
     mk_dstartup F._dstartup;
 
     mk__ F.anonymous;
Index: driver/optmain.ml
===================================================================
--- driver/optmain.ml	(Revision 2)
+++ driver/optmain.ml	(Revision 27)
@@ -142,6 +142,8 @@
   let _warn_help = Warnings.help_warnings
   let _where () = print_standard_library ()
 
+  let _linscan = set use_linscan
+
   let _nopervasives = set nopervasives
   let _dparsetree = set dump_parsetree
   let _drawlambda = set dump_rawlambda
@@ -158,6 +160,7 @@
   let _dreload = set dump_reload
   let _dscheduling = set dump_scheduling
   let _dlinear = set dump_linear
+  let _dinterval = set dump_interval
   let _dstartup = set keep_startup_file
 
   let anonymous = anonymous
Index: asmcomp/interval.mli
===================================================================
--- asmcomp/interval.mli	(Revision 0)
+++ asmcomp/interval.mli	(Revision 27)
@@ -0,0 +1,40 @@
+(***********************************************************************)
+(*                                                                     *)
+(*                           Objective Caml                            *)
+(*                                                                     *)
+(*                          Marcell Fischbach                          *)
+(*                                                                     *)
+(*  Copyright 2011 University of Siegen. All rights reserved.          *)
+(*  This file is distributed  under the terms of the                   *)
+(*  Q Public License version 1.0.                                      *)
+(*                                                                     *)
+(***********************************************************************)
+
+
+
+open Format
+
+ 
+type range =
+  {
+    mutable rbegin : int;
+    mutable rend : int;
+  }
+
+type interval = 
+  { 
+      mutable reg : Reg.t;
+      mutable ibegin : int;
+      mutable iend : int;
+      mutable ranges : range list;
+  }
+
+
+val all_intervals : unit -> interval list
+val all_fixed_intervals: unit -> interval list
+val debug_intervals: formatter ->  Mach.fundecl -> unit
+val build_intervals: Mach.fundecl -> unit
+val live_on: interval -> int -> bool
+val overlapping_ranges: range -> range -> bool
+val overlapping: interval -> interval -> bool
+val strip_expired_ranges: range list -> int -> range list
Index: asmcomp/asmgen.ml
===================================================================
--- asmcomp/asmgen.ml	(Revision 2)
+++ asmcomp/asmgen.ml	(Revision 27)
@@ -20,6 +20,8 @@
 open Misc
 open Cmm
 
+external sys_time : unit -> float = "caml_sys_time"
+
 type error = Assembler_error of string
 
 exception Error of error
@@ -38,21 +40,53 @@
   phrase
 
 let rec regalloc ppf round fd =
-  if round > 50 then
-    fatal_error(fd.Mach.fun_name ^
-                ": function too complex, cannot complete register allocation");
-  dump_if ppf dump_live "Liveness analysis" fd;
-  Interf.build_graph fd;
-  if !dump_interf then Printmach.interferences ppf ();
-  if !dump_prefer then Printmach.preferences ppf ();
-  Coloring.allocate_registers();
-  dump_if ppf dump_regalloc "After register allocation" fd;
-  let (newfd, redo_regalloc) = Reload.fundecl fd in
-  dump_if ppf dump_reload "After insertion of reloading code" newfd;
-  if redo_regalloc then begin
-    Reg.reinit(); Liveness.fundecl ppf newfd; regalloc ppf (round + 1) newfd
-  end else newfd
+  if not !use_linscan then begin
+    if round > 50 then
+      fatal_error(fd.Mach.fun_name ^
+                  ": function too complex, cannot complete register allocation");
+    dump_if ppf dump_live "Liveness analysis" fd;
+    Interf.build_graph fd;
+    if !dump_interf then Printmach.interferences ppf ();
+    if !dump_prefer then Printmach.preferences ppf ();
+    Coloring.allocate_registers();
+    dump_if ppf dump_regalloc "After register allocation" fd;
+    let (newfd, redo_regalloc) = Reload.fundecl fd in
+    if redo_regalloc then begin
+      Reg.reinit(); 
+      Liveness.fundecl ppf newfd; 
+      dump_if ppf dump_reload "After insertion of reloading code" newfd;
+      regalloc ppf (round + 1) newfd
+    end 
+    else 
+    begin
+      dump_if ppf dump_reload "After insertion of reloading code" newfd;
+      newfd
+    end
+  end
+  else
+    fd
 
+let rec regalloc_linscan ppf round fd =
+  if !use_linscan then begin
+    if round > 50 then
+      fatal_error(fd.Mach.fun_name ^
+                  ": function too complex, cannot complete register allocation");
+
+    Interval.build_intervals fd;
+    if !dump_interval then Interval.debug_intervals ppf fd;
+    Linscan.walk_intervals (Interval.all_intervals ()) (Interval.all_fixed_intervals()) fd;
+    let (newfd, redo_regalloc) = Reload.fundecl fd in
+    dump_if ppf dump_reload "After insertion of reloading code" newfd;
+    if redo_regalloc then begin
+      Reg.reinit(); 
+      Liveness.fundecl ppf newfd; 
+      regalloc_linscan ppf (round + 1) newfd
+    end else newfd
+  end
+  else
+    fd
+
+
 let (++) x f = f x
 
 let compile_fundecl (ppf : formatter) fd_cmm =
@@ -70,7 +104,8 @@
   ++ Split.fundecl
   ++ pass_dump_if ppf dump_split "After live range splitting"
   ++ liveness ppf
-  ++ regalloc ppf 1
+  ++ regalloc ppf 1 
+  ++ regalloc_linscan ppf 1
   ++ Linearize.fundecl
   ++ pass_dump_linear_if ppf dump_linear "Linearized code"
   ++ Scheduling.fundecl
Index: asmcomp/interval.ml
===================================================================
--- asmcomp/interval.ml	(Revision 0)
+++ asmcomp/interval.ml	(Revision 27)
@@ -0,0 +1,290 @@
+(***********************************************************************)
+(*                                                                     *)
+(*                           Objective Caml                            *)
+(*                                                                     *)
+(*                          Marcell Fischbach                          *)
+(*                                                                     *)
+(*  Copyright 2011 University of Siegen. All rights reserved.          *)
+(*  This file is distributed  under the terms of the                   *)
+(*  Q Public License version 1.0.                                      *)
+(*                                                                     *)
+(***********************************************************************)
+
+
+
+open List
+open Mach
+open Reg
+
+type range =
+  {
+    mutable rbegin : int;
+    mutable rend : int;
+  }
+
+type interval = 
+  { 
+      mutable reg : Reg.t;
+      mutable ibegin : int;
+      mutable iend : int;
+      mutable ranges : range list;
+  }
+
+
+let interval_list = ref ([] : interval list)
+let fixed_interval_list = ref ([] : interval list)
+let all_intervals() = !interval_list
+let all_fixed_intervals() = !fixed_interval_list
+
+let overlapping_ranges r0 r1 =
+  r0.rend > r1.rbegin && r1.rend > r0.rbegin
+
+
+let overlapping i0 i1 =
+
+  let rec test_ranges r0s r1s =
+    begin match r0s, r1s with
+    | [], _ -> false
+    | _, [] -> false
+    | r0::r0tl, r1::r1tl ->
+      if overlapping_ranges r0 r1 then true
+      else if r0.rend < r1.rend then test_ranges r0tl r1s
+      else if r0.rend > r1.rend then test_ranges r0s r1tl
+      else test_ranges r0tl r1tl
+    end 
+  in
+
+  test_ranges i0.ranges i1.ranges
+
+let live_on i p = 
+  let rec live_on_ranges r =
+    begin match r with
+    | [] -> false
+    | hd::tl -> 
+          if p < hd.rbegin then false
+          else if p < hd.rend then true
+          else live_on_ranges tl
+    end in
+  live_on_ranges i.ranges
+ 
+
+let rec strip_expired_ranges ranges pos =
+  begin match ranges with
+  | [] -> []
+  | hd::tl ->
+      if hd.rend > pos then ranges
+      else strip_expired_ranges tl pos 
+  end
+
+  
+
+
+let debug_intervals ppf fd =
+  Format.fprintf ppf "*** Intervals\n";
+  Format.fprintf ppf "%s\n" fd.fun_name;
+
+  let dump_interval i =
+      Format.fprintf ppf "  ";
+      Printmach.reg ppf i.reg;
+      List.iter (fun r ->
+	      Format.fprintf ppf " [%d;%d[ " r.rbegin r.rend
+	    ) i.ranges;
+      Format.fprintf ppf "\n"
+  in
+  List.iter dump_interval !fixed_interval_list;
+  List.iter dump_interval !interval_list;
+  ()
+
+
+let get_and_initialize_interval intervals reg pos_tst pos_set use_kind =
+  let interval = intervals.(reg.stamp) in
+  if interval.iend = 0 then begin
+    interval.ibegin <- pos_tst;
+    interval.iend <- pos_set;
+    interval.reg <- reg;
+    interval.ranges <- [{rbegin = pos_tst; rend = pos_set; }]
+  end;
+  interval
+   
+
+let update_interval_position intervals pos_tst pos_set use_kind reg =
+  let interval = get_and_initialize_interval intervals reg pos_tst pos_set use_kind in
+  let range = begin match interval.ranges with |[] -> Misc.fatal_error "Illegal empty range" | hd::_ -> hd end in
+
+  interval.iend <- pos_set;
+
+  if (range.rend = pos_tst || (range.rend + 1) = pos_tst) && use_kind != 1 then
+    range.rend <- pos_set
+  else if range.rbegin = pos_tst && range.rend = pos_tst && use_kind = 1 then
+    range.rend <- pos_set
+  else 
+    interval.ranges <- {rbegin=pos_tst;rend=pos_set;} :: interval.ranges
+
+
+
+let update_interval_position_by_reg_array intervals regs pos_tst pos_set use_kind =
+  Array.iter (update_interval_position intervals  pos_tst pos_set use_kind) regs
+         
+let update_interval_position_by_reg_set intervals regs pos_tst pos_set use_kind =
+  Set.iter (update_interval_position intervals  pos_tst pos_set use_kind) regs
+ 
+let update_interval_position_by_instr intervals instr pos_tst pos_set =
+  update_interval_position_by_reg_array intervals instr.arg pos_tst pos_set 0;
+  update_interval_position_by_reg_array intervals instr.res pos_set pos_set 1;
+  update_interval_position_by_reg_set intervals instr.live pos_tst pos_set 0
+
+
+let insert_pos_for_live intervals instr pos = 
+  if (not (Set.is_empty instr.live)) || Array.length instr.arg > 0 then
+  begin
+    pos := succ !pos;
+    update_interval_position_by_reg_set intervals instr.live !pos !pos 0;
+    update_interval_position_by_reg_array intervals instr.arg !pos !pos 0
+  end
+
+let insert_destroyed_at_oper intervals instr pos =
+  let destroyed = Proc.destroyed_at_oper instr.desc in
+  if Array.length destroyed > 0 then
+    update_interval_position_by_reg_array intervals destroyed pos pos 1 
+
+let insert_destroyed_at_raise intervals pos = 
+  let destroyed = Proc.destroyed_at_raise in
+  if Array.length destroyed > 0 then
+    update_interval_position_by_reg_array intervals destroyed pos pos 1
+
+
+(* generate all intervals.
+   the intervals will be expanded by one step at the beginning and
+   the ending of a basic block
+*)
+let build_intervals fundecl =
+
+  let intervals = Array.init (Reg.num_registers()) (fun i ->
+    { reg = Reg.dummy;
+      ibegin = 0;
+      iend = 0;
+      ranges = [];
+    }) in
+
+
+  let rec walk_instruction i pos shift =
+    pos := !pos + 1 + shift;
+    update_interval_position_by_instr intervals i (!pos - shift)  !pos;
+
+    
+    begin match i.desc with
+    | Iend -> 
+        (* end ends a bb *)
+        insert_pos_for_live intervals i pos;     
+
+    | Iop(Icall_ind | Icall_imm _ | Iextcall(_, true) | Itailcall_ind | Itailcall_imm _) ->
+        walk_instruction i.next pos 0 
+
+    | Iop _ ->
+        insert_destroyed_at_oper intervals i !pos;
+        walk_instruction i.next pos 0
+
+    | Ireturn ->
+        insert_destroyed_at_oper intervals i !pos;
+        (* returns ends a bb *)
+        insert_pos_for_live intervals i pos;     
+        walk_instruction i.next pos 0
+
+
+    | Iifthenelse(test, ifso, ifnot) ->
+        insert_destroyed_at_oper intervals i !pos;
+        (* if ends a bb *)
+        insert_pos_for_live intervals i pos;
+
+        (* ifso starts a new bb *)
+        walk_instruction ifso pos 1;
+
+        (* ifnot starts a new bb *)
+        walk_instruction ifnot pos 1;
+
+        (* next starts a new bb *)
+        walk_instruction i.next pos 1
+    | Iswitch(index, cases) ->
+        insert_destroyed_at_oper intervals i !pos;
+        (* switch ends a bb *)
+        insert_pos_for_live intervals i pos;
+
+        for j = 0 to Array.length cases -1 do
+          (* each case starts a new bb *)
+          walk_instruction cases.(j) pos 1
+        done;
+        (* next starts a new bb *)
+        walk_instruction i.next pos 1
+    | Iloop body ->
+        insert_destroyed_at_oper intervals i !pos;
+        (* loop ends a bb *)
+        insert_pos_for_live intervals i pos;
+
+        (* the body starts a new block *)
+        walk_instruction body pos 1;
+
+        (* next starts a new bb *)
+        walk_instruction i.next pos 1
+    | Icatch(io, body, handler) ->
+        insert_destroyed_at_oper intervals i !pos;
+        (* catch ends a bb *)
+        insert_pos_for_live intervals i pos;
+
+        (* the body starts a new bb *)
+        walk_instruction body pos 1;
+
+        (* the handler starts a new bb *)
+        walk_instruction handler pos 1;
+
+        (* next starts a new bb *)
+        walk_instruction i.next pos 1;
+    | Iexit nfail ->
+        insert_destroyed_at_oper intervals i !pos;
+        (* exit ends a bb *)
+        insert_pos_for_live intervals i pos;
+
+    | Itrywith(body, handler) ->
+        insert_destroyed_at_oper intervals i !pos;
+        (* trywith ends a bb *)
+        insert_pos_for_live intervals i pos;
+
+        (* the body starts a new bb *)
+        walk_instruction body pos 1;
+
+        (* the handler starts a new bb *)
+        insert_pos_for_live intervals handler pos;
+        insert_destroyed_at_raise intervals !pos;
+        walk_instruction handler pos 0;
+
+        (* nex starts a new bb *)
+        walk_instruction i.next pos 1
+    | Iraise ->
+        (* raise ends a bb *)
+        insert_pos_for_live intervals i pos;
+
+        walk_instruction i.next pos 1
+    end
+
+
+
+  in
+
+  let pos = ref 0 in
+  walk_instruction fundecl.fun_body pos 1;
+
+
+  interval_list := []; 
+  fixed_interval_list := []; 
+  Array.iter (fun i -> 
+        if i.iend != 0 then begin
+          i.ranges <- List.rev i.ranges;
+          begin match i.reg.loc with
+          | Reg r -> fixed_interval_list := i :: !fixed_interval_list
+          | _ -> interval_list := i :: !interval_list
+          end
+        end) intervals;
+
+
+  interval_list := List.sort (fun i0 i1 -> i0.ibegin - i1.ibegin) !interval_list;
+
+  ()
Index: asmcomp/linscan.ml
===================================================================
--- asmcomp/linscan.ml	(Revision 0)
+++ asmcomp/linscan.ml	(Revision 27)
@@ -0,0 +1,295 @@
+(***********************************************************************)
+(*                                                                     *)
+(*                           Objective Caml                            *)
+(*                                                                     *)
+(*                          Marcell Fischbach                          *)
+(*                                                                     *)
+(*  Copyright 2011 University of Siegen. All rights reserved.          *)
+(*  This file is distributed  under the terms of the                   *)
+(*  Q Public License version 1.0.                                      *)
+(*                                                                     *)
+(***********************************************************************)
+
+
+open Interval
+open Clflags
+open List
+open Format
+open Mach
+
+
+type active_t =
+{
+    mutable active : interval list;
+    mutable inactive : interval list;
+    mutable fixed : interval list;
+}
+
+
+let active = Array.init Proc.num_register_classes (fun i -> {active = []; inactive = []; fixed= [] })
+
+let rec insert_into active current = 
+  begin match active with
+  | [] -> [current]
+  | interval::tl ->
+    (* check code for <= or < *)
+    if interval.iend <= current.iend then 
+      current :: active
+    else
+      interval :: insert_into tl current
+  end
+
+
+let rec release_expired_fixed pos intervals =
+  begin match intervals with
+  | [] -> []
+  | interval::tl ->
+      if interval.iend > pos then begin
+        interval.ranges <- Interval.strip_expired_ranges interval.ranges pos;
+        interval :: release_expired_fixed pos tl
+      end
+      else
+        []
+ end
+
+ 
+let rec release_expired_active active_cl pos intervals =
+  begin match intervals with
+  | [] -> []
+  | interval::tl ->
+      if interval.iend > pos then begin
+        interval.ranges <- Interval.strip_expired_ranges interval.ranges pos;
+        if Interval.live_on interval pos then
+          interval :: release_expired_active active_cl pos tl
+        else begin
+          active_cl.inactive <- insert_into active_cl.inactive interval;
+          release_expired_active active_cl pos tl
+        end
+      end
+      else
+        []
+ end
+
+let rec release_expired_inactive active_cl pos intervals =
+  begin match intervals with
+  | [] -> []
+  | interval::tl ->
+      if interval.iend > pos then begin
+        interval.ranges <- Interval.strip_expired_ranges interval.ranges pos;
+        if not (Interval.live_on interval pos) then 
+          interval :: release_expired_inactive active_cl pos tl
+        else begin
+          active_cl.active <- insert_into active_cl.active interval;
+          release_expired_inactive active_cl pos tl
+        end
+      end
+      else
+        []
+ end
+
+
+   
+
+let get_stack_slot cl = 
+  let nslots = Proc.num_stack_slots.(cl) in
+  Proc.num_stack_slots.(cl) <- nslots + 1;
+  nslots
+    
+
+
+let pop_active active = 
+  begin match active with
+  | [] -> []
+  | _::tl -> tl
+  end
+  
+
+(* find a register for the given interval and assigns this
+   register. The interval is inserted into active.
+   If there is no space available for this interval then
+   nothings happens and false is returned. Otherwise 
+   returns true.
+   *)
+let try_alloc_free_register interval  =
+  let cl = Proc.register_class interval.reg in
+  (* this intervals has already been spilled *)
+  if interval.reg.Reg.spill then begin
+    begin match interval.reg.Reg.loc with
+    | Reg.Unknown -> interval.reg.Reg.loc <- Reg.Stack(Reg.Local (get_stack_slot cl));
+    | _ -> ()
+    end
+  end;
+  
+  let num = Proc.num_available_registers.(cl) in
+  if interval.reg.Reg.loc != Reg.Unknown then true (* this register is already allocated or spilled *)
+  else if num = 0 then false (* there are not registers for this class *)
+  else begin
+    let first_reg = Proc.first_available_register.(cl) in
+    let active_cl = active.(cl) in
+
+    (* create array containing all possible free regs *)
+    let regs = Array.make num true in
+    
+    (* remove all assigned registers from the free array *)
+    let rec remove_bound actives =
+      begin match actives with
+      | [] -> ()
+      | i::tl ->
+        begin
+          begin match i.reg.Reg.loc with
+          | Reg.Reg(r) -> regs.(r - first_reg) <- false
+          | _ -> ()
+          end;
+          remove_bound tl
+        end
+      end
+    in
+
+    remove_bound active_cl.active;
+      
+    (* remove all overlapping registers from the free array *)
+    let rec remove_bound_overlapping fix =
+      begin match fix with
+      | [] -> ()
+      | i::tl ->
+        begin
+          begin match i.reg.Reg.loc with
+          | Reg.Reg(r) -> 
+              if regs.(r-first_reg) && Interval.overlapping i interval then 
+                regs.(r - first_reg) <- false
+          | _ -> ()
+          end;
+          remove_bound_overlapping tl
+        end
+      end
+    in
+    remove_bound_overlapping active_cl.inactive;
+    remove_bound_overlapping active_cl.fixed;
+      
+
+    let rec find_first_free_reg c =
+      if c = num then -1
+      else if regs.(c) then c
+      else find_first_free_reg (c+1) in
+
+    let first_free_reg = find_first_free_reg 0 in
+    
+    if first_free_reg = -1 then false
+    else begin
+      (* assign the free register *)
+      interval.reg.Reg.loc <- Reg.Reg (first_reg + first_free_reg);
+      interval.reg.Reg.spill <- false;
+      (* and insert the current interval into active *)
+      active_cl.active <- insert_into active_cl.active interval;
+      true
+    end;
+  end
+ 
+
+let allocate_blocked_register interval =
+  let cl = Proc.register_class interval.reg in
+  let active_cl = active.(cl) in
+  
+  
+  if active_cl.active = [] then begin
+    (* this is the special case when there are no register at all
+       in the register class. This can happen e.g. for float Regs on i386 *)
+    interval.reg.Reg.loc <- Reg.Stack(Reg.Local (get_stack_slot cl));
+    interval.reg.Reg.spill <- true
+  end
+  else begin
+    
+    (* get the latest interval in active *)
+    let last_active = List.hd active_cl.active in
+    
+    if last_active.iend > interval.iend then begin
+      (* last interval in active ends latest -> spill it*)
+      
+      (* transfer the register from the active in the current interval *)
+      begin match last_active.reg.Reg.loc with 
+      | Reg.Reg r -> interval.reg.Reg.loc <- Reg.Reg r
+      | _ -> ()
+      end;
+      
+      (* remove the latest interval from active ... *)
+      active_cl.active <- pop_active active_cl.active;
+      (* ... and insert the current *)
+      active_cl.active <- insert_into active_cl.active interval;
+      
+      (* now get a new stack slot for the spilled register *)
+      last_active.reg.Reg.loc <- Reg.Stack(Reg.Local (get_stack_slot cl));
+      last_active.reg.Reg.spill <- true
+    end
+    else begin
+      (* the current interval ends latest -> spill it *)
+      interval.reg.Reg.loc <- Reg.Stack(Reg.Local (get_stack_slot cl));
+      interval.reg.Reg.spill <- true
+    end;
+  end;
+  ()
+
+
+let handle_interval interval =
+  let position = interval.ibegin in
+  
+  (* release all intervals that have been expired at the current step*)
+  for i = 0 to Proc.num_register_classes - 1 do
+    let active_cl = active.(i) in
+    active_cl.active    <- release_expired_active active_cl position active_cl.active;
+    active_cl.inactive  <- release_expired_inactive active_cl position active_cl.inactive;
+    active_cl.fixed     <- release_expired_fixed position active_cl.fixed;
+  done;
+  
+  
+  (* find a register for allocation *)
+  if not (try_alloc_free_register interval) then 
+    (* a valid free register could not be found, so we have to
+       decide which interval is to be spilled *)
+    allocate_blocked_register interval
+
+(* create active liste for every register class *)
+let initialize_interval_lists intervals =
+
+  
+  for i=0 to Proc.num_register_classes - 1 do 
+    let active_cl = active.(i) in
+    (* start with empty actives *)
+    active_cl.active    <- [];
+    active_cl.inactive  <- [];
+    active_cl.fixed     <- [];
+  done;
+
+  (* add all fixed intervals to the list of active_fixed intervals *)
+  let rec add_fixed_intervals intervals =
+    begin match intervals with
+    | [] -> ()
+    | i :: tl -> 
+      let active_cl = active.(Proc.register_class i.reg) in
+      active_cl.fixed <- i :: active_cl.fixed;
+      add_fixed_intervals tl
+    end in
+  add_fixed_intervals intervals;
+
+  for i = 0 to Proc.num_register_classes - 1 do
+    let active_cl = active.(i) in
+    active_cl.fixed <- List.sort (fun i0 i1 -> i1.iend - i0.iend) active_cl.fixed
+  done
+
+
+ 
+  
+
+let walk_intervals intervals fixed_intervals fd =
+  (* Initialize the stack slots *)
+  for i = 0 to Proc.num_register_classes - 1 do
+    Proc.num_stack_slots.(i) <- 0
+  done;
+  
+  
+  (* create the active lists *)
+  initialize_interval_lists fixed_intervals;
+  
+
+  (* Walk all the intervals within the list *)
+  List.iter handle_interval intervals
+
Index: asmcomp/linscan.mli
===================================================================
--- asmcomp/linscan.mli	(Revision 0)
+++ asmcomp/linscan.mli	(Revision 27)
@@ -0,0 +1,16 @@
+(***********************************************************************)
+(*                                                                     *)
+(*                           Objective Caml                            *)
+(*                                                                     *)
+(*                          Marcell Fischbach                          *)
+(*                                                                     *)
+(*  Copyright 2011 University of Siegen. All rights reserved.          *)
+(*  This file is distributed  under the terms of the                   *)
+(*  Q Public License version 1.0.                                      *)
+(*                                                                     *)
+(***********************************************************************)
+
+
+
+val walk_intervals: Interval.interval list -> Interval.interval list -> Mach.fundecl -> unit
+
Index: Makefile
===================================================================
--- Makefile	(Revision 2)
+++ Makefile	(Revision 27)
@@ -79,6 +79,7 @@
   asmcomp/interf.cmo asmcomp/coloring.cmo \
   asmcomp/reloadgen.cmo asmcomp/reload.cmo \
   asmcomp/printlinear.cmo asmcomp/linearize.cmo \
+  asmcomp/interval.cmo asmcomp/linscan.cmo \
   asmcomp/schedgen.cmo asmcomp/scheduling.cmo \
   asmcomp/emitaux.cmo asmcomp/emit.cmo asmcomp/asmgen.cmo \
   asmcomp/asmlink.cmo asmcomp/asmlibrarian.cmo asmcomp/asmpackager.cmo
Index: toplevel/opttopmain.ml
===================================================================
--- toplevel/opttopmain.ml	(Revision 2)
+++ toplevel/opttopmain.ml	(Revision 27)
@@ -85,6 +85,8 @@
   let _warn_error s = Warnings.parse_options true s
   let _warn_help = Warnings.help_warnings
 
+  let _linscan = set use_linscan
+
   let _dparsetree = set dump_parsetree
   let _drawlambda = set dump_rawlambda
   let _dlambda = set dump_lambda
@@ -100,6 +102,7 @@
   let _dreload = set dump_reload
   let _dscheduling = set dump_scheduling
   let _dlinear = set dump_linear
+  let _dinterval = set dump_interval
   let _dstartup = set keep_startup_file
 
   let anonymous = file_argument
Index: Makefile.nt
===================================================================
--- Makefile.nt	(Revision 2)
+++ Makefile.nt	(Revision 27)
@@ -76,6 +76,7 @@
   asmcomp/interf.cmo asmcomp/coloring.cmo \
   asmcomp/reloadgen.cmo asmcomp/reload.cmo \
   asmcomp/printlinear.cmo asmcomp/linearize.cmo \
+  asmcomp/interval.cmo asmcomp/linscan.cmo \
   asmcomp/schedgen.cmo asmcomp/scheduling.cmo \
   asmcomp/emitaux.cmo asmcomp/emit.cmo asmcomp/asmgen.cmo \
   asmcomp/asmlink.cmo asmcomp/asmlibrarian.cmo asmcomp/asmpackager.cmo
Index: utils/clflags.ml
===================================================================
--- utils/clflags.ml	(Revision 2)
+++ utils/clflags.ml	(Revision 27)
@@ -26,6 +26,7 @@
 and make_archive = ref false            (* -a *)
 and debug = ref false                   (* -g *)
 and fast = ref false                    (* -unsafe *)
+and use_linscan = ref false             (* -linscan *)
 and link_everything = ref false         (* -linkall *)
 and custom_runtime = ref false          (* -custom *)
 and output_c_object = ref false         (* -output-obj *)
@@ -73,6 +74,7 @@
 let dump_reload = ref false             (* -dreload *)
 let dump_scheduling = ref false         (* -dscheduling *)
 let dump_linear = ref false             (* -dlinear *)
+let dump_interval = ref false           (* -dinterval *)
 let keep_startup_file = ref false       (* -dstartup *)
 let dump_combine = ref false            (* -dcombine *)
 
Index: utils/clflags.mli
===================================================================
--- utils/clflags.mli	(Revision 2)
+++ utils/clflags.mli	(Revision 27)
@@ -23,6 +23,7 @@
 val make_archive : bool ref
 val debug : bool ref
 val fast : bool ref
+val use_linscan : bool ref
 val link_everything : bool ref
 val custom_runtime : bool ref
 val output_c_object : bool ref
@@ -67,6 +68,7 @@
 val dump_reload : bool ref
 val dump_scheduling : bool ref
 val dump_linear : bool ref
+val dump_interval : bool ref
 val keep_startup_file : bool ref
 val dump_combine : bool ref
 val native_code : bool ref

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [Caml-list] Re: Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-01 14:53 [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat Benedikt Meurer
@ 2011-08-01 15:03 ` Benedikt Meurer
  2011-08-01 15:04 ` [Caml-list] " Gabriel Scherer
  2011-08-01 15:29 ` Wojciech Meyer
  2 siblings, 0 replies; 20+ messages in thread
From: Benedikt Meurer @ 2011-08-01 15:03 UTC (permalink / raw)
  To: caml-list; +Cc: Marcell Fischbach


On Aug 1, 2011, at 16:53 , Benedikt Meurer wrote:

> The linear scan implementation reuses as much of the existing ocamlopt functionality as possible, so additional maintenance overhead should be manageable. Comments and suggestions are welcome of course. Please keep Marcell CC'ed with any replies as he's not subscribed to the list.

Mantis ticket created: http://caml.inria.fr/mantis/view.php?id=5324

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-01 14:53 [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat Benedikt Meurer
  2011-08-01 15:03 ` [Caml-list] " Benedikt Meurer
@ 2011-08-01 15:04 ` Gabriel Scherer
  2011-08-01 16:57   ` Benedikt Meurer
  2011-08-24 19:35   ` Benedikt Meurer
  2011-08-01 15:29 ` Wojciech Meyer
  2 siblings, 2 replies; 20+ messages in thread
From: Gabriel Scherer @ 2011-08-01 15:04 UTC (permalink / raw)
  To: Benedikt Meurer; +Cc: caml-list, Marcell Fischbach

This work is meant to make a compromise between generated code quality
and compilation speed to have good performances in rapid
prototyping/development scenario.

Do you have more precise measurements on
- the relative costs of the successive transformations during native
compilation (including external linking etc.)? Which proportion of
time is currently used for register allocation?
- the performance cost of this new allocator in the generated code? I
suppose the results may vary between different architectures (eg. x86
is probably more sensitive to good allocation decisions than x86_64).

On Mon, Aug 1, 2011 at 4:53 PM, Benedikt Meurer
<benedikt.meurer@googlemail.com> wrote:
> Hello,
>
> As mentioned earlier we have a student working on an implementation of the Linear Scan Register Allocator [1] for ocamlopt (and thereby ocamlnat). It took some time, but now there's a first working patch which looks promising. This work is done by Marcell Fischbach as part of his diploma thesis. The idea is to use the linear scan algorithm to drive the register allocation in the native top-level ocamlnat at some point, as suggested by Fabrice Le Fessant [2].
>
> Marcell is now working to implement a proof-of-concept of an inline assembler for ocamlnat on i386 based on code from Alain Frisch an Fabrice Le Fessant. The result will also be contributed once ready, and will be used to effectively compare ocamlnat and the byte-code ocaml top-level.
>
> The linear scan implementation reuses as much of the existing ocamlopt functionality as possible, so additional maintenance overhead should be manageable. Comments and suggestions are welcome of course. Please keep Marcell CC'ed with any replies as he's not subscribed to the list.
>
> greets,
> Benedikt
>
>
> [1] http://portal.acm.org/citation.cfm?id=330250
> [2] http://caml.inria.fr/pub/ml-archives/caml-list/2010/11/a1b0ed0934ff51df4ac07c5e9da6e9d6.en.html
>
>
> --
> Caml-list mailing list.  Subscription management and archives:
> https://sympa-roc.inria.fr/wws/info/caml-list
> Beginner's list: http://groups.yahoo.com/group/ocaml_beginners
> Bug reports: http://caml.inria.fr/bin/caml-bugs
>
>


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-01 14:53 [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat Benedikt Meurer
  2011-08-01 15:03 ` [Caml-list] " Benedikt Meurer
  2011-08-01 15:04 ` [Caml-list] " Gabriel Scherer
@ 2011-08-01 15:29 ` Wojciech Meyer
  2011-08-01 17:03   ` Benedikt Meurer
  2 siblings, 1 reply; 20+ messages in thread
From: Wojciech Meyer @ 2011-08-01 15:29 UTC (permalink / raw)
  To: Benedikt Meurer; +Cc: caml-list, Marcell Fischbach

[-- Attachment #1: Type: text/plain, Size: 2639 bytes --]

On Mon, Aug 1, 2011 at 3:53 PM, Benedikt Meurer <
benedikt.meurer@googlemail.com> wrote:

> Hello,
>
> As mentioned earlier we have a student working on an implementation of the
> Linear Scan Register Allocator [1] for ocamlopt (and thereby ocamlnat). It
> took some time, but now there's a first working patch which looks promising.
> This work is done by Marcell Fischbach as part of his diploma thesis. The
> idea is to use the linear scan algorithm to drive the register allocation in
> the native top-level ocamlnat at some point, as suggested by Fabrice Le
> Fessant [2].
>
> Marcell is now working to implement a proof-of-concept of an inline
> assembler for ocamlnat on i386 based on code from Alain Frisch an Fabrice Le
> Fessant. The result will also be contributed once ready, and will be used to
> effectively compare ocamlnat and the byte-code ocaml top-level.
>
> The linear scan implementation reuses as much of the existing ocamlopt
> functionality as possible, so additional maintenance overhead should be
> manageable. Comments and suggestions are welcome of course. Please keep
> Marcell CC'ed with any replies as he's not subscribed to the list.
>

Hello Benedikt,

It's also worth to note that there is some generic mid/back-end code ready,
in my OCaml native compiler framework called DragonKit in a spirit of LLVM,
which I am currently actively working on. It's already able to express a toy
language and JIT compile and run it within the same process on top of x86
architecture:

The code includes:
- SSA based IL using polymorphic variants
- monadic code generator
- plugable passes using first class modules
- very ad-hoc register allocator
- X86 backend
- example that evolved from LLVM kaleidoscope ported to DragonKit to
Pascal/ML like language, and another example using direct translation to X86
backend.

The code is functorised and almost purely functional (with few exceptions
where there is no benefit of doing that).

Maybe it could have some use in your new toplevel, or maybe it would be
worth to reuse some of your work in DragonKit (or viceversa).

I welcome anybody wanting to join me in this effort.

Please take a look at:

www.github.com/danmey/DragonKit

Cheers;
Wojciech


greets,

> Benedikt
>
>
> [1] http://portal.acm.org/citation.cfm?id=330250
> [2]
> http://caml.inria.fr/pub/ml-archives/caml-list/2010/11/a1b0ed0934ff51df4ac07c5e9da6e9d6.en.html
>
>
> --
> Caml-list mailing list.  Subscription management and archives:
> https://sympa-roc.inria.fr/wws/info/caml-list
> Beginner's list: http://groups.yahoo.com/group/ocaml_beginners
> Bug reports: http://caml.inria.fr/bin/caml-bugs
>
>

[-- Attachment #2: Type: text/html, Size: 3720 bytes --]

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-01 15:04 ` [Caml-list] " Gabriel Scherer
@ 2011-08-01 16:57   ` Benedikt Meurer
  2011-08-24 19:35   ` Benedikt Meurer
  1 sibling, 0 replies; 20+ messages in thread
From: Benedikt Meurer @ 2011-08-01 16:57 UTC (permalink / raw)
  To: Gabriel Scherer; +Cc: caml-list, Marcell Fischbach

On Aug 1, 2011, at 17:04 , Gabriel Scherer wrote:

> This work is meant to make a compromise between generated code quality
> and compilation speed to have good performances in rapid
> prototyping/development scenario.
> 
> Do you have more precise measurements on
> - the relative costs of the successive transformations during native
> compilation (including external linking etc.)? Which proportion of
> time is currently used for register allocation?

Right now most of the time in ocamlnat is spent in waiting for the assembler and linker to finish and the runtime linker to load the generated library file. However there are no precise timing results yet.

Marcell did a rough test with ocamlopt last week, building the test suite with both graph coloring and linear scan on a 2009 iMac (Core 2 Duo). The overall time spent in the register allocator dropped from 28s to 8s.

> - the performance cost of this new allocator in the generated code? I
> suppose the results may vary between different architectures (eg. x86
> is probably more sensitive to good allocation decisions than x86_64).

Same here... not yet. From what I've seen, the generated amd64 code is really close to the graph coloring code (isomorphic up to register renaming in most cases). Dunno for i386 yet.

Benedikt

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-01 15:29 ` Wojciech Meyer
@ 2011-08-01 17:03   ` Benedikt Meurer
  0 siblings, 0 replies; 20+ messages in thread
From: Benedikt Meurer @ 2011-08-01 17:03 UTC (permalink / raw)
  To: Wojciech Meyer; +Cc: caml-list, Marcell Fischbach


On Aug 1, 2011, at 17:29 , Wojciech Meyer wrote:

> Hello Benedikt,

Hello Wojciech,

> It's also worth to note that there is some generic mid/back-end code ready, in my OCaml native compiler framework called DragonKit in a spirit of LLVM, which I am currently actively working on. It's already able to express a toy language and JIT compile and run it within the same process on top of x86 architecture:
> 
> The code includes:
> - SSA based IL using polymorphic variants
> - monadic code generator
> - plugable passes using first class modules
> - very ad-hoc register allocator
> - X86 backend
> - example that evolved from LLVM kaleidoscope ported to DragonKit to Pascal/ML like language, and another example using direct translation to X86 backend.
> 
> The code is functorised and almost purely functional (with few exceptions where there is no benefit of doing that).
> 
> Maybe it could have some use in your new toplevel, or maybe it would be worth to reuse some of your work in DragonKit (or viceversa).
> 
> I welcome anybody wanting to join me in this effort.
> 
> Please take a look at:
> 
> www.github.com/danmey/DragonKit

I doubt that this would be applicable in the context of ocamlopt. What we are heading for is simply a way to avoid the external as and ld invocations by doing the work in-process. Code generation is done already, we just replace the last phase (see emit.ml).

BTW: On a related note, we also have a student working on a LLVM backend for ocamlopt as part of his bachelor thesis, which may be related to what you do.

> Cheers;
> Wojciech

greets,
Benedikt



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-01 15:04 ` [Caml-list] " Gabriel Scherer
  2011-08-01 16:57   ` Benedikt Meurer
@ 2011-08-24 19:35   ` Benedikt Meurer
  2011-08-24 20:40     ` Gerd Stolpmann
                       ` (2 more replies)
  1 sibling, 3 replies; 20+ messages in thread
From: Benedikt Meurer @ 2011-08-24 19:35 UTC (permalink / raw)
  To: caml-list


On Aug 1, 2011, at 17:04 , Gabriel Scherer wrote:

> Do you have more precise measurements on

Also posting Marcell's timing results here for reference (taken from bug 5324).

> - the performance cost of this new allocator in the generated code? I
> suppose the results may vary between different architectures (eg. x86
> is probably more sensitive to good allocation decisions than x86_64).

- http://ps.informatik.uni-siegen.de/~meurer/tmp/compiletime_timings.pdf contains a comparison of the ocamlopt invocations.
- http://ps.informatik.uni-siegen.de/~meurer/tmp/runtime_timings.pdf contains comparison of the generated code.

As can be seen from the results, amd64 is more sensitive to register allocator changes than i386. Not really surprising to me. Would be interesting to see how this affects PPC/Sparc/Mips, but we don't have appropriate hardware available right now. Anyone with appropriate hardware and some spare time? :-)

Benedikt


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-24 19:35   ` Benedikt Meurer
@ 2011-08-24 20:40     ` Gerd Stolpmann
  2011-08-25  8:02       ` Benedikt Meurer
  2011-08-24 20:47     ` malc
  2011-08-26 12:40     ` Roberto Di Cosmo
  2 siblings, 1 reply; 20+ messages in thread
From: Gerd Stolpmann @ 2011-08-24 20:40 UTC (permalink / raw)
  To: Benedikt Meurer; +Cc: caml-list

Am Mittwoch, den 24.08.2011, 21:35 +0200 schrieb Benedikt Meurer:
> On Aug 1, 2011, at 17:04 , Gabriel Scherer wrote:
> 
> > Do you have more precise measurements on
> 
> Also posting Marcell's timing results here for reference (taken from bug 5324).
> 
> > - the performance cost of this new allocator in the generated code? I
> > suppose the results may vary between different architectures (eg. x86
> > is probably more sensitive to good allocation decisions than x86_64).
> 
> - http://ps.informatik.uni-siegen.de/~meurer/tmp/compiletime_timings.pdf contains a comparison of the ocamlopt invocations.
> - http://ps.informatik.uni-siegen.de/~meurer/tmp/runtime_timings.pdf contains comparison of the generated code.
> 
> As can be seen from the results, amd64 is more sensitive to register allocator changes than i386. 

Well, this particular i386 CPU model is a strange guy - Northwoods have
this extremely long pipeline, which is very sensitive to unforeseen
jumps. It would be more interesting to see this test on a modern CPU in
i386 mode. My guess is that it behaves then more like amd64.

Gerd

> Not really surprising to me. Would be interesting to see how this affects PPC/Sparc/Mips, but we don't have appropriate hardware available right now. Anyone with appropriate hardware and some spare time? :-)
> 
> Benedikt
> 
> 

-- 
------------------------------------------------------------
Gerd Stolpmann, Darmstadt, Germany    gerd@gerd-stolpmann.de
Creator of GODI and camlcity.org.
Contact details:        http://www.camlcity.org/contact.html
Company homepage:       http://www.gerd-stolpmann.de
*** Searching for new projects! Need consulting for system
*** programming in Ocaml? Gerd Stolpmann can help you.
------------------------------------------------------------


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-24 19:35   ` Benedikt Meurer
  2011-08-24 20:40     ` Gerd Stolpmann
@ 2011-08-24 20:47     ` malc
  2011-08-26 12:40     ` Roberto Di Cosmo
  2 siblings, 0 replies; 20+ messages in thread
From: malc @ 2011-08-24 20:47 UTC (permalink / raw)
  To: Benedikt Meurer; +Cc: caml-list

On Wed, 24 Aug 2011, Benedikt Meurer wrote:

> 
> On Aug 1, 2011, at 17:04 , Gabriel Scherer wrote:
> 
> > Do you have more precise measurements on
> 
> Also posting Marcell's timing results here for reference (taken from bug 
> 5324).
> 
> > - the performance cost of this new allocator in the generated code? I
> > suppose the results may vary between different architectures (eg. x86
> > is probably more sensitive to good allocation decisions than x86_64).
> 
> - http://ps.informatik.uni-siegen.de/~meurer/tmp/compiletime_timings.pdf contains a comparison of the ocamlopt invocations.
> - http://ps.informatik.uni-siegen.de/~meurer/tmp/runtime_timings.pdf contains comparison of the generated code.
> 
> As can be seen from the results, amd64 is more sensitive to register 
> allocator changes than i386. Not really surprising to me. Would be 
> interesting to see how this affects PPC/Sparc/Mips, but we don't have 
> appropriate hardware available right now. Anyone with appropriate 
> hardware and some spare time? :-)
> 

I can test things on PPC/PPC64(well... CBE).

-- 
mailto:av1474@comtv.ru

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-24 20:40     ` Gerd Stolpmann
@ 2011-08-25  8:02       ` Benedikt Meurer
  2011-08-25  9:34         ` Benedikt Meurer
  0 siblings, 1 reply; 20+ messages in thread
From: Benedikt Meurer @ 2011-08-25  8:02 UTC (permalink / raw)
  To: Gerd Stolpmann; +Cc: caml-list


On Aug 24, 2011, at 22:40 , Gerd Stolpmann wrote:

>>> - the performance cost of this new allocator in the generated code? I
>>> suppose the results may vary between different architectures (eg. x86
>>> is probably more sensitive to good allocation decisions than x86_64).
>> 
>> - http://ps.informatik.uni-siegen.de/~meurer/tmp/compiletime_timings.pdf contains a comparison of the ocamlopt invocations.
>> - http://ps.informatik.uni-siegen.de/~meurer/tmp/runtime_timings.pdf contains comparison of the generated code.
>> 
>> As can be seen from the results, amd64 is more sensitive to register allocator changes than i386. 
> 
> Well, this particular i386 CPU model is a strange guy - Northwoods have
> this extremely long pipeline, which is very sensitive to unforeseen
> jumps. It would be more interesting to see this test on a modern CPU in
> i386 mode. My guess is that it behaves then more like amd64.

Modern CPUs most probably don't run ocaml in 32bit mode, but more likely in long mode. That's why we choose to run the i386 on "real 32bit hardware", where the ocaml i386 port is actually used.

I'll rerun the benchmark in 32bit mode on a modern cpu to see how things change.

> Gerd

Benedikt

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-25  8:02       ` Benedikt Meurer
@ 2011-08-25  9:34         ` Benedikt Meurer
  2011-08-25 10:21           ` Gerd Stolpmann
  0 siblings, 1 reply; 20+ messages in thread
From: Benedikt Meurer @ 2011-08-25  9:34 UTC (permalink / raw)
  To: caml-list; +Cc: Gerd Stolpmann, Fischbach Marcell


On Aug 25, 2011, at 10:02 , Benedikt Meurer wrote:

>>> - http://ps.informatik.uni-siegen.de/~meurer/tmp/compiletime_timings.pdf contains a comparison of the ocamlopt invocations.
>>> - http://ps.informatik.uni-siegen.de/~meurer/tmp/runtime_timings.pdf contains comparison of the generated code.
>>> 
>>> As can be seen from the results, amd64 is more sensitive to register allocator changes than i386. 
>> 
>> Well, this particular i386 CPU model is a strange guy - Northwoods have
>> this extremely long pipeline, which is very sensitive to unforeseen
>> jumps. It would be more interesting to see this test on a modern CPU in
>> i386 mode. My guess is that it behaves then more like amd64.
> 
> Modern CPUs most probably don't run ocaml in 32bit mode, but more likely in long mode. That's why we choose to run the i386 on "real 32bit hardware", where the ocaml i386 port is actually used.
> 
> I'll rerun the benchmark in 32bit mode on a modern cpu to see how things change.

Reran the benchmark with 32bit ocaml on the MBP (Early 2011), results are available at:

http://ps.informatik.uni-siegen.de/~meurer/tmp/linscan-i7-i386-timings.pdf

Benedikt



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-25  9:34         ` Benedikt Meurer
@ 2011-08-25 10:21           ` Gerd Stolpmann
  2011-08-25 10:25             ` Pierre-Alexandre Voye
  2011-08-25 10:43             ` Benedikt Meurer
  0 siblings, 2 replies; 20+ messages in thread
From: Gerd Stolpmann @ 2011-08-25 10:21 UTC (permalink / raw)
  To: Benedikt Meurer; +Cc: caml-list, Fischbach Marcell

Am Donnerstag, den 25.08.2011, 11:34 +0200 schrieb Benedikt Meurer:
> On Aug 25, 2011, at 10:02 , Benedikt Meurer wrote:
> 
> >>> - http://ps.informatik.uni-siegen.de/~meurer/tmp/compiletime_timings.pdf contains a comparison of the ocamlopt invocations.
> >>> - http://ps.informatik.uni-siegen.de/~meurer/tmp/runtime_timings.pdf contains comparison of the generated code.
> >>> 
> >>> As can be seen from the results, amd64 is more sensitive to register allocator changes than i386. 
> >> 
> >> Well, this particular i386 CPU model is a strange guy - Northwoods have
> >> this extremely long pipeline, which is very sensitive to unforeseen
> >> jumps. It would be more interesting to see this test on a modern CPU in
> >> i386 mode. My guess is that it behaves then more like amd64.
> > 
> > Modern CPUs most probably don't run ocaml in 32bit mode, but more likely in long mode. That's why we choose to run the i386 on "real 32bit hardware", where the ocaml i386 port is actually used.

Right. However, if you look at new 32-bit-only CPUs like older Atoms
these base on modern cores where only some of the circuits were omitted
that are needed for 64-bit execution.
> > 
> > I'll rerun the benchmark in 32bit mode on a modern cpu to see how things change.
> 
> Reran the benchmark with 32bit ocaml on the MBP (Early 2011), results are available at:
> 
> http://ps.informatik.uni-siegen.de/~meurer/tmp/linscan-i7-i386-timings.pdf

Hard to interpret. I have the impression that the compile times for
graph coloring on i386 are lower than on amd64 - has maybe to do with
the number of registers - but those for linear scanning are about the
same. (I'm comparing here with the amd64 numbers taken on the same
machine.)

The graphs for the runtimes are on a logarithmic scale. Do you also have
raw numbers? 

Gerd

> 
> Benedikt
> 
> 
> 

-- 
------------------------------------------------------------
Gerd Stolpmann, Darmstadt, Germany    gerd@gerd-stolpmann.de
Creator of GODI and camlcity.org.
Contact details:        http://www.camlcity.org/contact.html
Company homepage:       http://www.gerd-stolpmann.de
*** Searching for new projects! Need consulting for system
*** programming in Ocaml? Gerd Stolpmann can help you.
------------------------------------------------------------


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-25 10:21           ` Gerd Stolpmann
@ 2011-08-25 10:25             ` Pierre-Alexandre Voye
  2011-08-26 10:58               ` Guillaume Yziquel
  2011-08-26 12:29               ` Erik de Castro Lopo
  2011-08-25 10:43             ` Benedikt Meurer
  1 sibling, 2 replies; 20+ messages in thread
From: Pierre-Alexandre Voye @ 2011-08-25 10:25 UTC (permalink / raw)
  To: Gerd Stolpmann; +Cc: Benedikt Meurer, caml-list, Fischbach Marcell

[-- Attachment #1: Type: text/plain, Size: 1701 bytes --]

I have a stupid question : I wonder if it would not be a bad idea that Ocaml
output C code and let gcc do its work, so compile code with good
performances in a lot of architecture ? Gcc is able to do autovectorization
(SSE, MMX, Larabee in the futur, etc...), very specific processor
optimization, etc...
But maybe it's a stupid idea ?

2011/8/25 Gerd Stolpmann <info@gerd-stolpmann.de>

> Am Donnerstag, den 25.08.2011, 11:34 +0200 schrieb Benedikt Meurer:
> > On Aug 25, 2011, at 10:02 , Benedikt Meurer wrote:
> >
> > >>> -
> http://ps.informatik.uni-siegen.de/~meurer/tmp/compiletime_timings.pdfcontains a comparison of the ocamlopt invocations.
> > >>> - http://ps.informatik.uni-siegen.de/~meurer/tmp/runtime_timings.pdfcontains comparison of the generated code.
> > >>>
> > >>> As can be seen from the results, amd64 is more sensitive to register
> allocator changes than i386.
> > >>
> > >> Well, this particular i386 CPU model is a strange guy - Northwoods
> have
> > >> this extremely long pipeline, which is very sensitive to unforeseen
> > >> jumps. It would be more interesting to see this test on a modern CPU
> in
> > >> i386 mode. My guess is that it behaves then more like amd64.
> > >
> > > Modern CPUs most probably don't run ocaml in 32bit mode, but more
> likely in long mode. That's why we choose to run the i386 on "real 32bit
> hardware", where the ocaml i386 port is actually used.
>
> Right. However, if you look at new 32-bit-only CPUs like older Atoms
> these base on modern cores where only some of the circuits were omitted
> that are needed for 64-bit execution.--



> [cut]
>
---------------------
https://twitter.com/#!/ontologiae/
http://linuxfr.org/users/montaigne

[-- Attachment #2: Type: text/html, Size: 2655 bytes --]

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-25 10:21           ` Gerd Stolpmann
  2011-08-25 10:25             ` Pierre-Alexandre Voye
@ 2011-08-25 10:43             ` Benedikt Meurer
  1 sibling, 0 replies; 20+ messages in thread
From: Benedikt Meurer @ 2011-08-25 10:43 UTC (permalink / raw)
  To: Gerd Stolpmann; +Cc: caml-list, Fischbach Marcell


On Aug 25, 2011, at 12:21 , Gerd Stolpmann wrote:

>> Reran the benchmark with 32bit ocaml on the MBP (Early 2011), results are available at:
>> 
>> http://ps.informatik.uni-siegen.de/~meurer/tmp/linscan-i7-i386-timings.pdf
> 
> Hard to interpret. I have the impression that the compile times for
> graph coloring on i386 are lower than on amd64 - has maybe to do with
> the number of registers - but those for linear scanning are about the
> same. (I'm comparing here with the amd64 numbers taken on the same
> machine.)
> 
> The graphs for the runtimes are on a logarithmic scale. Do you also have
> raw numbers? 

Of course.

These are the raw numbers for the i7 (i386):

command                  opt_gc     opt_ls       opt_gc/ls
----------------------------------------------------------
binarytree 16             1.096      1.102           0.995
fasta 100                 0.008      0.008           1.000
fasta 250000              0.044      0.044           1.000
fasta 25000000            2.767      2.737           1.011
mandelbrot 1000           0.224      0.224           1.000
mandelbrot 4000           3.583      3.570           1.004
meteor 2098               0.578      0.586           0.986
nbody 3000000             1.142      1.168           0.978
spectral 3000             4.963      5.243           0.947
almabench                 4.665      4.687           0.995
almabench.unsafe          4.629      4.702           0.984
bdd                       0.279      0.285           0.979
boyer                     0.566      0.572           0.990
fft                       0.421      0.422           0.998
nucleic                   0.499      0.513           0.973
quicksort                 0.150      0.149           1.007
quicksort.unsafe          0.121      0.121           1.000
sorts                     1.889      1.947           0.970

And the raw numbers for the P4:

command                  opt_gc     opt_ls       opt_gc/ls
----------------------------------------------------------
binarytree 16             2.796      2.780           1.006
fasta 100                 0.024      0.024           1.000
fasta 250000              0.404      0.408           0.990
fasta 25000000           33.350     33.402           0.998
mandelbrot 1000           0.892      0.884           1.009
mandelbrot 4000          14.220     14.040           1.013
meteor 2098               1.300      1.312           0.991
nbody 3000000             4.524      4.840           0.935
spectral 3000            51.979     53.919           0.964
almabench                17.261     17.721           0.974
almabench.unsafe         17.317     18.441           0.939
bdd                       1.256      1.248           1.006
boyer                     1.668      1.672           0.998
fft                       3.064      3.080           0.995
nucleic                   2.188      2.268           0.965
quicksort                 0.340      0.340           1.000
quicksort.unsafe          0.272      0.272           1.000
sorts                     6.164      6.180           0.997

> Gerd

Benedikt

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-25 10:25             ` Pierre-Alexandre Voye
@ 2011-08-26 10:58               ` Guillaume Yziquel
  2011-08-26 12:29               ` Erik de Castro Lopo
  1 sibling, 0 replies; 20+ messages in thread
From: Guillaume Yziquel @ 2011-08-26 10:58 UTC (permalink / raw)
  To: Pierre-Alexandre Voye
  Cc: Gerd Stolpmann, Benedikt Meurer, caml-list, Fischbach Marcell

Le Thursday 25 Aug 2011 à 12:25:49 (+0200), Pierre-Alexandre Voye a écrit :
>    I have a stupid question : I wonder if it would not be a bad idea that
>    Ocaml output C code and let gcc do its work, so compile code with good
>    performances in a lot of architecture ? Gcc is able to do
>    autovectorization (SSE, MMX, Larabee in the futur, etc...), very
>    specific processor optimization, etc...
>    But maybe it's a stupid idea ?

I do not think it's a stupid idea. I've been quite stunned by how the
Mercury compiler does it (using cpp macros and so-called virtual
registers in C code). It's likely a pain however, and most compilers
I've seen using the "compile to C" approach usually use Boehm's GC, so
it may be quite a pain to adapt the OCaml runtime to it. However, gcc
isn't the golden bullet for everything. It definitely sucks with the
SIMD instruction set.

-- 
     Guillaume Yziquel

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-25 10:25             ` Pierre-Alexandre Voye
  2011-08-26 10:58               ` Guillaume Yziquel
@ 2011-08-26 12:29               ` Erik de Castro Lopo
       [not found]                 ` <CANnJ5GfmnOhk8mJor8PfrGoC1-5vZWOgUKNZMT9k69iu0TBCeA@mail.gmail.com>
  1 sibling, 1 reply; 20+ messages in thread
From: Erik de Castro Lopo @ 2011-08-26 12:29 UTC (permalink / raw)
  To: caml-list

Pierre-Alexandre Voye wrote:

> I have a stupid question : I wonder if it would not be a bad idea that Ocaml
> output C code and let gcc do its work, so compile code with good
> performances in a lot of architecture ? Gcc is able to do autovectorization
> (SSE, MMX, Larabee in the futur, etc...), very specific processor
> optimization, etc...
> But maybe it's a stupid idea ?

I have some experience in thie area. I work on the DDC compiler [0]
a compiler for a strict by default (optionally lazy) evaluation
dialect of Haskell.

When I joined the project the compiler had a working compile via C
backend, to which I added an LLVM backend [1].

Executables compiled via the LLVM backend (even without exploring
any of the LLVM optimisation passes) were faster than the same
executables compiled via C (gcc -O2). I suspect this is because
the generated C code was nothing like the C code people write and
the GCC is only good at optimising idiomatic C code.

I highly recommend LLVM as a compiler backend.

HTH,
Erik

[0] http://disciple.ouroborus.net/
[1] http://www.mega-nerd.com/erikd/Blog/CodeHacking/DDC/index.html

-- 
----------------------------------------------------------------------
Erik de Castro Lopo
http://www.mega-nerd.com/

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-24 19:35   ` Benedikt Meurer
  2011-08-24 20:40     ` Gerd Stolpmann
  2011-08-24 20:47     ` malc
@ 2011-08-26 12:40     ` Roberto Di Cosmo
  2 siblings, 0 replies; 20+ messages in thread
From: Roberto Di Cosmo @ 2011-08-26 12:40 UTC (permalink / raw)
  To: Benedikt Meurer; +Cc: caml-list

Dear Benedikt,
	this is a typical case where the FSF compile farm may come
in handy.

You can find more information on the machines available for this
kind of tests here: http://gcc.gnu.org/wiki/CompileFarm

To get an account on the machines, just follow the instructions on 
"How to get involved" on the page, you should be up and running
in very little time

--Roberto

On Wed, Aug 24, 2011 at 09:35:56PM +0200, Benedikt Meurer wrote:
> 
> On Aug 1, 2011, at 17:04 , Gabriel Scherer wrote:
> 
> > Do you have more precise measurements on
> 
> Also posting Marcell's timing results here for reference (taken from bug 5324).
> 
> > - the performance cost of this new allocator in the generated code? I
> > suppose the results may vary between different architectures (eg. x86
> > is probably more sensitive to good allocation decisions than x86_64).
> 
> - http://ps.informatik.uni-siegen.de/~meurer/tmp/compiletime_timings.pdf contains a comparison of the ocamlopt invocations.
> - http://ps.informatik.uni-siegen.de/~meurer/tmp/runtime_timings.pdf contains comparison of the generated code.
> 
> As can be seen from the results, amd64 is more sensitive to register allocator changes than i386. Not really surprising to me. Would be interesting to see how this affects PPC/Sparc/Mips, but we don't have appropriate hardware available right now. Anyone with appropriate hardware and some spare time? :-)
> 
> Benedikt
> 
> 
> -- 
> Caml-list mailing list.  Subscription management and archives:
> https://sympa-roc.inria.fr/wws/info/caml-list
> Beginner's list: http://groups.yahoo.com/group/ocaml_beginners
> Bug reports: http://caml.inria.fr/bin/caml-bugs
> 

-- 
--Roberto Di Cosmo
 
------------------------------------------------------------------
Professeur               En delegation a l'INRIA
PPS                      E-mail: roberto@dicosmo.org
Universite Paris Diderot WWW  : http://www.dicosmo.org
Case 7014                Tel  : ++33-(0)1-57 27 92 20
5, Rue Thomas Mann       
F-75205 Paris Cedex 13   Identica: http://identi.ca/rdicosmo
FRANCE.                  Twitter: http://twitter.com/rdicosmo
------------------------------------------------------------------
Attachments:
   MIME accepted
   Word deprecated, http://www.rfc1149.net/documents/whynotword
------------------------------------------------------------------
Office location:
 
Bureau 6C08 (6th floor)
175, rue du Chevaleret, XIII
Metro Chevaleret, ligne 6
------------------------------------------------------------------                                                 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
       [not found]                   ` <CANnJ5Gf4AnKPDvrKiYx0joiPbQVOCdFWgPR24Y1ZYYL1Qv5=fg@mail.gmail.com>
@ 2011-08-26 18:33                     ` Pierre-Alexandre Voye
  2011-08-30 21:07                       ` Fermin Reig
  0 siblings, 1 reply; 20+ messages in thread
From: Pierre-Alexandre Voye @ 2011-08-26 18:33 UTC (permalink / raw)
  To: caml-list

[-- Attachment #1: Type: text/plain, Size: 1410 bytes --]

Note that the ocaml compiler has a flag -cmm which outputs C-- ast code.
F. Reig made a c-- ocaml backend during his thesis.  Including a GC.
Unhappily, sources code haven't been released.
But it proves it works.

Le 26 août 2011 14:30, "Erik de Castro Lopo" <mle+ocaml@mega-nerd.com> a
écrit :

Pierre-Alexandre Voye wrote:

> I have a stupid question : I wonder if it would not be a bad idea th...
I have some experience in thie area. I work on the DDC compiler [0]
a compiler for a strict by default (optionally lazy) evaluation
dialect of Haskell.

When I joined the project the compiler had a working compile via C
backend, to which I added an LLVM backend [1].

Executables compiled via the LLVM backend (even without exploring
any of the LLVM optimisation passes) were faster than the same
executables compiled via C (gcc -O2). I suspect this is because
the generated C code was nothing like the C code people write and
the GCC is only good at optimising idiomatic C code.

I highly recommend LLVM as a compiler backend.

HTH,
Erik

[0] http://disciple.ouroborus.net/
[1] http://www.mega-nerd.com/erikd/Blog/CodeHacking/DDC/index.html

--
----------------------------------------------------------------------
Erik de Castro Lopo
http://www.mega-nerd.com/

-- 
Caml-list mailing list. Subscription management and archives:
https://sympa-roc.inria.fr/wws/i...

[-- Attachment #2: Type: text/html, Size: 2017 bytes --]

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-26 18:33                     ` Pierre-Alexandre Voye
@ 2011-08-30 21:07                       ` Fermin Reig
  2011-08-30 21:21                         ` Pierre-Alexandre Voye
  0 siblings, 1 reply; 20+ messages in thread
From: Fermin Reig @ 2011-08-30 21:07 UTC (permalink / raw)
  To: Pierre-Alexandre Voye; +Cc: caml-list

Pierre-Alexandre Voye wrote:
> Note that the ocaml compiler has a flag -cmm which outputs C-- ast code.
> F. Reig made a c-- ocaml backend during his thesis.  Including a GC.
> Unhappily, sources code haven't been released.
> But it proves it works.

If anyone is interested, the dissertation is available at 
http://theses.gla.ac.uk/686/

Fermin

> 
>> Le 26 août 2011 14:30, "Erik de Castro Lopo" <mle+ocaml@mega-nerd.com 
>> <mailto:mle%2Bocaml@mega-nerd.com>> a écrit :
>>
>> Pierre-Alexandre Voye wrote:
>>
>> > I have a stupid question : I wonder if it would not be a bad idea th...
>>
>> I have some experience in thie area. I work on the DDC compiler [0]
>> a compiler for a strict by default (optionally lazy) evaluation
>> dialect of Haskell.
>>
>> When I joined the project the compiler had a working compile via C
>> backend, to which I added an LLVM backend [1].
>>
>> Executables compiled via the LLVM backend (even without exploring
>> any of the LLVM optimisation passes) were faster than the same
>> executables compiled via C (gcc -O2). I suspect this is because
>> the generated C code was nothing like the C code people write and
>> the GCC is only good at optimising idiomatic C code.
>>
>> I highly recommend LLVM as a compiler backend.
>>
>> HTH,
>> Erik
>>
>> [0] http://disciple.ouroborus.net/
>> [1] http://www.mega-nerd.com/erikd/Blog/CodeHacking/DDC/index.html
>>
>> --
>> ----------------------------------------------------------------------
>> Erik de Castro Lopo
>> http://www.mega-nerd.com/
>>
>>
>> -- 
>> Caml-list mailing list. Subscription management and archives:
>> https://sympa-roc.inria.fr/wws/i...
>>


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat
  2011-08-30 21:07                       ` Fermin Reig
@ 2011-08-30 21:21                         ` Pierre-Alexandre Voye
  0 siblings, 0 replies; 20+ messages in thread
From: Pierre-Alexandre Voye @ 2011-08-30 21:21 UTC (permalink / raw)
  To: Fermin Reig; +Cc: caml-list

[-- Attachment #1: Type: text/plain, Size: 2208 bytes --]

Hi Fermin,
Your thesis is very interessting, but your code is far more interesting !
Why don't you release it with an open source licence ?

2011/8/30 Fermin Reig <reig@acm.org>

> Pierre-Alexandre Voye wrote:
>
>> Note that the ocaml compiler has a flag -cmm which outputs C-- ast code.
>> F. Reig made a c-- ocaml backend during his thesis.  Including a GC.
>> Unhappily, sources code haven't been released.
>> But it proves it works.
>>
>
> If anyone is interested, the dissertation is available at
> http://theses.gla.ac.uk/686/
>
> Fermin
>
>
>>  Le 26 août 2011 14:30, "Erik de Castro Lopo" <mle+ocaml@mega-nerd.com<mailto:
>>> mle%2Bocaml@mega-nerd.**com <mle%252Bocaml@mega-nerd.com>>> a écrit :
>>>
>>>
>>> Pierre-Alexandre Voye wrote:
>>>
>>> > I have a stupid question : I wonder if it would not be a bad idea th...
>>>
>>> I have some experience in thie area. I work on the DDC compiler [0]
>>> a compiler for a strict by default (optionally lazy) evaluation
>>> dialect of Haskell.
>>>
>>> When I joined the project the compiler had a working compile via C
>>> backend, to which I added an LLVM backend [1].
>>>
>>> Executables compiled via the LLVM backend (even without exploring
>>> any of the LLVM optimisation passes) were faster than the same
>>> executables compiled via C (gcc -O2). I suspect this is because
>>> the generated C code was nothing like the C code people write and
>>> the GCC is only good at optimising idiomatic C code.
>>>
>>> I highly recommend LLVM as a compiler backend.
>>>
>>> HTH,
>>> Erik
>>>
>>> [0] http://disciple.ouroborus.net/
>>> [1] http://www.mega-nerd.com/**erikd/Blog/CodeHacking/DDC/**index.html<http://www.mega-nerd.com/erikd/Blog/CodeHacking/DDC/index.html>
>>>
>>> --
>>> ------------------------------**------------------------------**
>>> ----------
>>> Erik de Castro Lopo
>>> http://www.mega-nerd.com/
>>>
>>>
>>> --
>>> Caml-list mailing list. Subscription management and archives:
>>> https://sympa-roc.inria.fr/**wws/i. <https://sympa-roc.inria.fr/wws/i.>
>>> ..
>>>
>>>
>


-- 
---------------------
https://twitter.com/#!/ontologiae/
http://linuxfr.org/users/montaigne

[-- Attachment #2: Type: text/html, Size: 3396 bytes --]

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2011-08-30 21:21 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-08-01 14:53 [Caml-list] Linear Scan Register Allocator for ocamlopt/ocamlnat Benedikt Meurer
2011-08-01 15:03 ` [Caml-list] " Benedikt Meurer
2011-08-01 15:04 ` [Caml-list] " Gabriel Scherer
2011-08-01 16:57   ` Benedikt Meurer
2011-08-24 19:35   ` Benedikt Meurer
2011-08-24 20:40     ` Gerd Stolpmann
2011-08-25  8:02       ` Benedikt Meurer
2011-08-25  9:34         ` Benedikt Meurer
2011-08-25 10:21           ` Gerd Stolpmann
2011-08-25 10:25             ` Pierre-Alexandre Voye
2011-08-26 10:58               ` Guillaume Yziquel
2011-08-26 12:29               ` Erik de Castro Lopo
     [not found]                 ` <CANnJ5GfmnOhk8mJor8PfrGoC1-5vZWOgUKNZMT9k69iu0TBCeA@mail.gmail.com>
     [not found]                   ` <CANnJ5Gf4AnKPDvrKiYx0joiPbQVOCdFWgPR24Y1ZYYL1Qv5=fg@mail.gmail.com>
2011-08-26 18:33                     ` Pierre-Alexandre Voye
2011-08-30 21:07                       ` Fermin Reig
2011-08-30 21:21                         ` Pierre-Alexandre Voye
2011-08-25 10:43             ` Benedikt Meurer
2011-08-24 20:47     ` malc
2011-08-26 12:40     ` Roberto Di Cosmo
2011-08-01 15:29 ` Wojciech Meyer
2011-08-01 17:03   ` Benedikt Meurer

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).