caml-list - the Caml user's mailing list
 help / color / mirror / Atom feed
From: skaller <skaller@users.sourceforge.net>
To: Vincent Hanquez <tab@snarc.org>
Cc: caml-list@yquem.inria.fr, Chris King <colanderman@gmail.com>
Subject: Re: [Caml-list] Re: Rope is the new string
Date: Fri, 12 Oct 2007 00:48:16 +1000	[thread overview]
Message-ID: <1192114097.6184.7.camel@rosella.wigram> (raw)
In-Reply-To: <20071011142141.GA8001@snarc.org>

On Thu, 2007-10-11 at 16:21 +0200, Vincent Hanquez wrote:
> On Thu, Oct 11, 2007 at 11:54:24PM +1000, skaller wrote:
> > You can't: Camomile is massive for a reason.. the problem it
> > aims to solve is complex and hard to do efficiently without
> > a large set of specialised functions.
> 
> You are assuming that i want efficiency where i want to print few
> unicode string in an ui here and there. I *DON'T* want to be exposed to
> full unicode, i need something like 1/100 of camomile library.

In that case, you can use an int array for Unicode, provided the
code points fit in 31 bits OR you have a 64 bit machine. These routines
should help with converting to and from UTF-8:


(* parse the first utf8 encoded character of a string s
  starting at index position i, return a pair
  consisting of the decoded integer, and the position
  of the first byte not decoded.

  The 1 to 4 byte forms are accepted, as well as the
  historical 5 and 6 byte forms. Only the payload bits of
  the lead byte contribute to the result: 5 bits for a
  2 byte sequence, 4 bits for 3 bytes, 3 bits for 4 bytes,
  2 bits for 5 bytes and 1 bit for 6 bytes.

  If the first byte is bad (it is not a lead byte, or the
  string is too short for the sequence it announces), its
  value is returned together with i+1. Continuation bytes
  are not validated, only their low six bits are used, so
  if the encoding is bad the result is an unspecified value.

  Fails (raises Failure) if the index is past or at
  the end of the string.

  COMPATIBILITY NOTE: if this function is called
  with a SINGLE character string, it will return
  the usual value for the character, in range
  0 .. 255
*)

let parse_utf8 (s : string)  (i : int) : int * int =
  (* number of bytes available from index i onwards *)
  let n = String.length s - i in
  if n <= 0 then
    failwith
    (
      "parse_utf8: index "^ string_of_int i^
      " >= "^string_of_int (String.length s)^
      " = length of '" ^ s ^ "'"
    )
  else
    (* six payload bits of the continuation byte at offset k from i *)
    let cont k = int_of_char s.[i + k] land 0x3F in
    let lead = int_of_char s.[i] in
    if lead land 0x80 = 0 then
      lead, i+1 (* ASCII *)
    else if lead land 0xE0 = 0xC0 && n > 1 then
      (* 110xxxxx : 5 payload bits *)
      ((lead land 0x1F) lsl 6) lor cont 1, i+2
    else if lead land 0xF0 = 0xE0 && n > 2 then
      (* 1110xxxx : 4 payload bits *)
      ((lead land 0x0F) lsl 12) lor
        (cont 1 lsl 6) lor
        cont 2, i+3
    else if lead land 0xF8 = 0xF0 && n > 3 then
      (* 11110xxx : 3 payload bits. Masking with 0x1F here would
         let the marker bit 0x10 leak into the code point. *)
      ((lead land 0x07) lsl 18) lor
        (cont 1 lsl 12) lor
        (cont 2 lsl 6) lor
        cont 3, i+4
    else if lead land 0xFC = 0xF8 && n > 4 then
      (* 111110xx : 2 payload bits *)
      ((lead land 0x03) lsl 24) lor
        (cont 1 lsl 18) lor
        (cont 2 lsl 12) lor
        (cont 3 lsl 6) lor
        cont 4, i+5
    else if lead land 0xFE = 0xFC && n > 5 then
      (* 1111110x : 1 payload bit, which lands on bit 30 of the result *)
      ((lead land 0x01) lsl 30) lor
        (cont 1 lsl 24) lor
        (cont 2 lsl 18) lor
        (cont 3 lsl 12) lor
        (cont 4 lsl 6) lor
        cont 5, i+6
    else lead, i+1  (* error, just use bad character *)


(* encode the integer i as a utf-8 byte string, from 1 byte for
  values below 0x80 up to the 6 byte form for values of
  0x4000000 and above. A negative i is rejected by Char.chr
  (Invalid_argument). *)
let utf8_of_int i =
  (* lead byte: length marker or-ed with the masked top bits of i *)
  let lead marker shift mask =
    Char.chr (marker lor ((i lsr shift) land mask))
  in
  (* continuation byte 10xxxxxx holding the six bits of i at shift *)
  let cont shift = Char.chr (0x80 lor ((i lsr shift) land 0x3F)) in
  let bytes =
    if i < 0x80 then
      [ Char.chr i ]
    else if i < 0x800 then
      [ lead 0xC0 6 0x1F; cont 0 ]
    else if i < 0x10000 then
      [ lead 0xE0 12 0xF; cont 6; cont 0 ]
    else if i < 0x200000 then
      [ lead 0xF0 18 0x7; cont 12; cont 6; cont 0 ]
    else if i < 0x4000000 then
      [ lead 0xF8 24 0x3; cont 18; cont 12; cont 6; cont 0 ]
    else
      [ lead 0xFC 30 0x1; cont 24; cont 18; cont 12; cont 6; cont 0 ]
  in
  String.concat "" (List.map (String.make 1) bytes)



-- 
John Skaller <skaller at users dot sf dot net>
Felix, successor to C++: http://felix.sf.net


  parent reply	other threads:[~2007-10-11 14:48 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-10-08 15:08 Correct way of programming a CGI script Tom
2007-10-08 15:32 ` [Caml-list] " Dario Teixeira
2007-10-08 16:04 ` Gerd Stolpmann
2007-10-08 21:37   ` skaller
2007-10-08 22:21     ` Erik de Castro Lopo
2007-10-08 23:05       ` skaller
2007-10-08 23:19         ` skaller
2007-10-08 23:23           ` Arnaud Spiwack
2007-10-08 23:47             ` skaller
2007-10-09  5:49         ` David Teller
2007-10-09 10:15         ` Christophe TROESTLER
2007-10-09 15:29           ` skaller
2007-10-09 15:49             ` Vincent Hanquez
2007-10-09 16:00               ` Jon Harrop
2007-10-09 14:02         ` William D. Neumann
2007-10-09 15:25           ` skaller
2007-10-09 15:33             ` William D. Neumann
2007-10-09 15:48             ` Jon Harrop
2007-10-08 23:37       ` skaller
2007-10-09 10:20         ` Christophe TROESTLER
2007-10-09 13:40           ` Rope is the new string Jon Harrop
2007-10-09 15:57             ` [Caml-list] " Vincent Hanquez
2007-10-09 16:42               ` Loup Vaillant
2007-10-09 16:55                 ` Vincent Hanquez
2007-10-09 17:32                   ` Loup Vaillant
2007-10-09 19:51                     ` Vincent Hanquez
2007-10-09 21:06                       ` Loup Vaillant
2007-10-10  7:35                         ` Vincent Hanquez
2007-10-10  8:05                           ` Loup Vaillant
2007-10-11 13:23                             ` Vincent Hanquez
2007-10-09 22:04                       ` Chris King
2007-10-11 13:03                         ` Vincent Hanquez
2007-10-11 13:54                           ` skaller
2007-10-11 14:21                             ` Vincent Hanquez
2007-10-11 14:27                               ` Benjamin Monate
2007-10-11 14:48                               ` skaller [this message]
2007-10-11 21:16                                 ` Alain Frisch
2007-10-15 20:35                                 ` Warning on home-made functions dealing with UTF-8 Julien Moutinho
2007-10-15 23:51                                   ` [Caml-list] " skaller
2007-10-16  2:21                                     ` Julien Moutinho
2007-10-16 18:46                                   ` Julien Moutinho
2007-10-16 18:51                                     ` Julien Moutinho
2007-10-17  2:23                                     ` [Caml-list] " skaller
2007-10-09 10:26     ` [Caml-list] Correct way of programming a CGI script Gerd Stolpmann
2007-10-09 15:16       ` skaller
2007-10-09 15:31         ` William D. Neumann
2007-10-09 12:52     ` Brian Hurt
2007-10-09 13:56   ` Jon Harrop
2007-10-09 15:18     ` William D. Neumann
2007-10-08 16:11 ` Loup Vaillant
2007-10-08 19:07   ` Christophe TROESTLER

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1192114097.6184.7.camel@rosella.wigram \
    --to=skaller@users.sourceforge.net \
    --cc=caml-list@yquem.inria.fr \
    --cc=colanderman@gmail.com \
    --cc=tab@snarc.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).