module Stream = struct module type Read = sig type t type symbol val get: t -> symbol end module type Write = sig type t type symbol val put: t -> symbol -> unit end end module Block = struct module type Read = sig type t val read: t -> string -> int -> int -> int end module type Write = sig type t val write: t -> string -> int -> int -> unit end end module Filter = struct module type Read = sig include Stream.Read type source val flush: t -> unit val attach: source -> t end module type Write = sig include Stream.Write type dest val flush: t -> unit val attach: dest -> t end end module File = struct module Read = struct type file = Pervasives.in_channel type t = file let _open name :t = Pervasives.open_in_bin name let close (t:t) = Pervasives.close_in t let seek = seek_in let pos = pos_in type symbol = char let get (t:t) = Pervasives.input_char t let read (t:t) ic buf pos = Pervasives.input t ic buf pos end module Write = struct type file = Pervasives.out_channel type t = file let _open name :t = Pervasives.open_out_bin name let close (t:t) = Pervasives.close_out t let seek = seek_out let pos = pos_out type symbol = char let put (t:t) ch = Pervasives.output_char t ch let write = Pervasives.output end end module Buffer = struct module Read(B:Block.Read) : (Filter.Read with type symbol=char and type source=B.t) = struct type t = { buf : String.t; mutable pos : int; mutable level: int; source: B.t } type source = B.t type symbol = char let attach b = let blen = 1024 in { buf = String.create blen; pos = 0; level = 0; source = b } let get t = if t.pos = t.level then begin match B.read t.source t.buf 0 (String.length t.buf) with 0 -> raise End_of_file | n -> t.pos <- 0; t.level <- n end; let ch = t.buf.[t.pos] in t.pos <- t.pos + 1; ch let flush t = t.pos <- 0; t.level <- 0 end module Write(B:Block.Write) : (Filter.Write with type symbol=char and type dest=B.t) = struct type dest = B.t type symbol = char type t = { buf : String.t; mutable pos : int; dest: B.t } let attach t = { buf = String.create 256; pos = 0; dest = t; } let flush t = B.write t.dest t.buf 0 t.pos; t.pos <- 0 let put t ch = t.buf.[t.pos] <- ch; t.pos <- t.pos + 1; if t.pos >= String.length t.buf then flush t end end module Socket = struct type sock = Unix.file_descr type t = sock let create ?(domain=Unix.PF_INET) ?(protocol=0) _type :t = Unix.socket domain _type protocol let close = Unix.close module Read = struct type t = sock let read = Unix.read let shutdown s = Unix.shutdown s Unix.SHUTDOWN_RECEIVE end module Write = struct type t = sock let rec write t buf off len = match Unix.write t buf off len with n when n=len -> () | n -> write t buf (off+n) (len-n) let shutdown s = Unix.shutdown s Unix.SHUTDOWN_SEND end let connect ?sock addr :(Read.t*Write.t) = let s = match sock with Some sock -> sock | None -> create Unix.SOCK_STREAM in Unix.connect s addr; (s,s) end module UTF8 = struct exception InvalidSymbol type utf8 = int type t = utf8 module Read(Src:Stream.Read with type symbol=char) : (Filter.Read with type symbol=utf8 and type source=Src.t) = struct type symbol = utf8 type t = Src.t type source = Src.t let attach (t:Src.t) : t = t let flush t = () let get t = let next t = let ch = int_of_char (Src.get t) in if (ch land 0xC0) = 0x80 then ch land 0x3F else raise InvalidSymbol in let ch0 = int_of_char (Src.get t) in if ch0 < 0x80 then ch0 else if ch0 < 0xE0 then let ch1 = next t in let ch = ((ch0 land 0x1F) lsl 6) lor ch1 in if ch < 0x80 then raise InvalidSymbol else ch else if ch0 < 0xF0 then let ch1 = next t in let ch2 = next t in let ch = ((ch0 land 0x0F) lsl 12) lor (ch1 lsl 6) lor ch2 in if ch < 0x800 then raise InvalidSymbol else ch else if ch0 < 0xF8 then let ch1 = next t in let ch2 = next t in let ch3 = next t in let ch = ((ch0 land 0x03) lsl 18) lor (ch1 lsl 12) lor (ch2 lsl 6) lor ch3 in if ch < 0x10000 then raise InvalidSymbol else ch else raise InvalidSymbol end module Write(Dst:Stream.Write with type symbol=char): (Filter.Write with type symbol=utf8 and type dest=Dst.t) = struct type symbol = utf8 type dest = Dst.t type t = Dst.t let attach t = t let flush t = () let put t ch = if ch < 0x80 then Dst.put t (char_of_int ch) else if ch < 0x800 then begin Dst.put t (char_of_int (0xC0 lor (ch lsr 6))); Dst.put t (char_of_int (0x80 lor (ch land 0x3F))) end else if ch < 0x10000 then begin Dst.put t (char_of_int (0xE0 lor (ch lsr 12))); Dst.put t (char_of_int (0x80 lor ((ch lsr 6) land 0x3F))); Dst.put t (char_of_int (0x80 lor (ch land 0x3F))) end else if ch < 0x110000 then begin Dst.put t (char_of_int (0xF0 lor (ch lsr 18))); Dst.put t (char_of_int (0x80 lor ((ch lsr 12) land 0x3F))); Dst.put t (char_of_int (0x80 lor ((ch lsr 6) land 0x3F))); Dst.put t (char_of_int (0x80 lor (ch land 0x3F))) end else raise InvalidSymbol end end module type Type = sig type t end module Char = struct type t = char end module Copy (T:Type) (Src:Stream.Read with type symbol=T.t) (Dst:Stream.Write with type symbol=T.t) = struct let run s d = while true do Dst.put d (Src.get s) done end let copy_byte src dst = let module CopyFile = Copy(Char) (File.Read) (File.Write) in let src = File.Read._open src and dst = File.Write._open dst in try CopyFile.run src dst with End_of_file -> File.Read.close src; File.Write.close dst module FileWriteUtf8 = UTF8.Write (File.Write) let copy_utf src dst = let module FileReadUtf8 = UTF8.Read (File.Read) in let module CopyFile = Copy(UTF8) (FileReadUtf8) (FileWriteUtf8) in let src = File.Read._open src and dst = File.Write._open dst in let src8 = FileReadUtf8.attach src and dst8 = FileWriteUtf8.attach dst in try CopyFile.run src8 dst8 with End_of_file -> File.Read.close src; File.Write.close dst module BufferedFileRead = Buffer.Read(File.Read) module BufferedFileReadUtf8 = UTF8.Read(BufferedFileRead) module FileReadUtf8 = UTF8.Read(File.Read) let copy_utf2 src dst = let module CopyFile = Copy (UTF8) (BufferedFileReadUtf8) (FileWriteUtf8) in let src = File.Read._open src and dst = File.Write._open dst in let srcb = BufferedFileRead.attach src in let src8 = BufferedFileReadUtf8.attach srcb and dst8 = FileWriteUtf8.attach dst in try CopyFile.run src8 dst8 with End_of_file -> File.Read.close src; File.Write.close dst module Utf2Ascii (Dst:Stream.Write with type symbol=char ) = struct type symbol = UTF8.t type t = Dst.t type dest = Dst.t let attach (t:Dst.t) :t = t let put t ch = match ch with ch when ch < 0x80 -> Dst.put t (char_of_int ch) | _ -> Dst.put t ' ' end let copy_utf2ascii src dst = let module FileWriteAscii = Utf2Ascii(File.Write) in let module CopyFile = Copy (UTF8) (BufferedFileReadUtf8) (FileWriteAscii) in let src = File.Read._open src and dst = File.Write._open dst in let srcb = BufferedFileRead.attach src in let src8 = BufferedFileReadUtf8.attach srcb in let dsta = FileWriteAscii.attach dst in try CopyFile.run src8 dsta with End_of_file -> File.Read.close src; File.Write.close dst module Static = struct type symbol = char type t = { mutable pos: int; mutable buf: String.t list; mutable cur: String.t } let attach l = { pos = 0; buf = l; cur = ""; } let rec get t = if t.pos < String.length t.cur then begin let ch = t.cur.[t.pos] in t.pos <- t.pos + 1; ch end else match t.buf with hd::tl -> t.buf <- tl; t.pos <- 0; t.cur <- hd; get t | [] -> raise End_of_file end let copy_static dst = let module CopyFile = Copy (Char) (Static) (File.Write) in let dst = File.Write._open dst and src = Static.attach ["Hello";"\n";"World";"\n"] in try CopyFile.run src dst with End_of_file -> File.Write.close dst module StreamSockRead = Buffer.Read(Socket.Read) module StreamSockWrite = Buffer.Write(Socket.Write) let get_utf8 host url dst = let addr = Unix.ADDR_INET((Unix.gethostbyname host).Unix.h_addr_list.(0), 80) in let (sr,sw) = Socket.connect addr in let req = Static.attach ["GET"; " ";url; " "; "HTTP/0.9";"\n";"\n"] in let sock_wr = StreamSockWrite.attach sw and sock_rd = StreamSockRead.attach sr in let module SendReq = Copy (Char) (Static) (StreamSockWrite) in try SendReq.run req sock_wr with End_of_file -> StreamSockWrite.flush sock_wr; let dst = File.Write._open dst in let module GetFile = Copy (Char) (StreamSockRead) (File.Write) in try GetFile.run sock_rd dst with End_of_file -> Socket.close sr; File.Write.close dst module Length (T:Type) (Src:Stream.Read with type symbol=T.t) = struct let get s = let len = ref 0 in try while true do ignore (Src.get s); incr len done; -1 with End_of_file -> !len end let length file = let src = File.Read._open file in let module FileCharLen = Length (Char) (File.Read) in let len = FileCharLen.get src in File.Read.seek src 0; let module FileUtf8Len = Length (UTF8) (FileReadUtf8) in let usrc = FileReadUtf8.attach src in let len8 = FileUtf8Len.get usrc in File.Read.close src; len,len8 let run () = let html = "utf8.html" in List.iter ( fun (desc,op,file) -> print_endline desc; op file; let (bytes,symbols) = length file in Printf.printf "Done. File '%s'; bytes %d, symbols %d\n" file bytes symbols ) [ "Hello World", copy_static, "static.dat"; "Getting corpse from a web ..", (get_utf8 "www.columbia.edu" "/kermit/utf8.html" ), html; "Copying corpse using bytecopy", (copy_byte html), "copy.dat"; "Copying corpse using utf8 symbols", (copy_utf html), "copy_utf8.dat"; "Copying corpse using utf8 symbols and read buffer", (copy_utf2 html), "copy_utf8_2.dat"; "Converting corpse to ASCII", (copy_utf2ascii html), "ascii.dat"; ] let _ = run ()