Decoding UTF-8 streams

UTF-8 is one of the most commonly used methods to encode Unicode characters into byte values. It has some interesting properties; for example, characters from the ASCII character set retain their encoding. This is an implementation of UTF-8 decoding in OCaml.

The interface of the decoder is:

val utf8_decode : char Stream.t -> int Stream.t

The decoding works lazily on streams, i.e. the resulting stream will decode characters as they are requested. In case of illegal input, the code will raise stream errors.

(** Classification of a single input byte by its leading bit pattern, as
    produced by [classify]:
    - [Single c]: a byte with the top bit clear; [c] is the byte value.
    - [Cont bits]: a continuation byte (bit pattern 10xxxxxx); [bits] holds
      its low 6 bits.
    - [Leading (n, bits)]: a lead byte announcing [n] continuation bytes;
      [bits] holds the payload bits left after the length prefix.
    - [Invalid]: a byte matching none of the above (0xFE or 0xFF). *)
type byte = Single of int | Cont of int | Leading of int * int | Invalid
 
(** [classify chr] maps the byte [chr] to its {!byte} category.

    The category is fixed by the position of the leftmost zero bit, which
    splits the 256 byte values into the contiguous ranges matched below.
    The payload carried by each constructor is the byte value with the
    prefix bits masked off. *)
let classify chr =
    let code = Char.code chr in
    match chr with
    (* 0xxxxxxx: the byte stands for itself *)
    | '\x00' .. '\x7f' -> Single code
    (* 10xxxxxx: continuation byte, 6 payload bits *)
    | '\x80' .. '\xbf' -> Cont (code land 0x3f)
    (* 110xxxxx: one continuation byte follows *)
    | '\xc0' .. '\xdf' -> Leading (1, code land 0x1f)
    (* 1110xxxx: two continuation bytes follow *)
    | '\xe0' .. '\xef' -> Leading (2, code land 0x0f)
    (* 11110xxx: three continuation bytes follow *)
    | '\xf0' .. '\xf7' -> Leading (3, code land 0x07)
    (* 111110xx: four continuation bytes follow *)
    | '\xf8' .. '\xfb' -> Leading (4, code land 0x03)
    (* 1111110x: five continuation bytes follow *)
    | '\xfc' .. '\xfd' -> Leading (5, code land 0x01)
    (* 1111111x: no zero bit in the top seven positions *)
    | '\xfe' .. '\xff' -> Invalid
 
(** [follow strm n c] returns the code point obtained by appending to [c] the
    payload bits of the next [n] continuation bytes of [strm]. Every byte that
    is examined is removed from [strm], including a byte that turns out not to
    be a continuation byte.

    @raise Stream.Error if [strm] ends, or yields a byte that is not a
    continuation byte, before [n] continuation bytes have been read. *)
let rec follow strm n c =
    if n = 0 then c
    else
        (* Peek rather than call Stream.next: at end of input Stream.next
           raises Stream.Failure, which stream consumers read as a clean end
           of stream instead of as malformed input. *)
        match Stream.peek strm with
        | None -> raise (Stream.Error "Continuation byte expected")
        | Some chr ->
            Stream.junk strm;
            (match classify chr with
            | Cont cc ->
                (* each continuation byte contributes 6 more low-order bits *)
                follow strm (n-1) ((c lsl 6) lor (cc land 0x3f))
            | Single _ | Leading _ | Invalid ->
                raise (Stream.Error "Continuation byte expected"))
 
(** [utf8_decode strm] returns a stream of code points decoded lazily from the
    byte stream [strm]: bytes are consumed from [strm] only when an element of
    the result is requested, and the result ends when [strm] is exhausted at a
    character boundary.

    Errors are raised when the offending element is requested, not when
    [utf8_decode] is called.

    @raise Stream.Error if the next byte is a continuation byte or an invalid
    byte, or if a multi-byte sequence encodes a value that fits in a shorter
    sequence (an overlong encoding). Exceptions raised by [follow] while
    reading continuation bytes propagate unchanged. *)
let rec utf8_decode strm =
    Stream.slazy (fun () ->
        match Stream.peek strm with
        | Some chr ->
            Stream.junk strm;
            (match classify chr with
            | Single c -> Stream.icons c (utf8_decode strm)
            | Cont _ -> raise (Stream.Error "Unexpected continuation byte")
            | Leading (n, c) ->
                let code = follow strm n c in
                (* Smallest value that genuinely needs [n] continuation
                   bytes; anything below it is an overlong encoding, which
                   must be rejected so that one code point cannot be smuggled
                   past byte-level checks under several spellings. *)
                let shortest =
                    match n with
                    | 1 -> 0x80
                    | 2 -> 0x800
                    | 3 -> 0x10000
                    | 4 -> 0x200000
                    | _ -> 0x4000000
                in
                if code < shortest then
                    raise (Stream.Error "Overlong encoding")
                else Stream.icons code (utf8_decode strm)
            | Invalid -> raise (Stream.Error "Invalid byte"))
        | None -> Stream.sempty)

Posted on the occasion of 20 years of UTF-8.

Leave a Reply

Your email address will not be published. Required fields are marked *

You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>