File : a-suewst.adb


   1 ------------------------------------------------------------------------------
   2 --                                                                          --
   3 --                         GNAT RUN-TIME COMPONENTS                         --
   4 --                                                                          --
   5 --                   ADA.STRINGS.UTF_ENCODING.WIDE_STRINGS                  --
   6 --                                                                          --
   7 --                                 B o d y                                  --
   8 --                                                                          --
   9 --          Copyright (C) 2010-2011, Free Software Foundation, Inc.         --
  10 --                                                                          --
  11 -- GNAT is free software;  you can  redistribute it  and/or modify it under --
  12 -- terms of the  GNU General Public License as published  by the Free Soft- --
  13 -- ware  Foundation;  either version 3,  or (at your option) any later ver- --
  14 -- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
  15 -- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
  16 -- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
  17 --                                                                          --
  18 --                                                                          --
  19 --                                                                          --
  20 --                                                                          --
  21 --                                                                          --
  22 -- You should have received a copy of the GNU General Public License and    --
  23 -- a copy of the GCC Runtime Library Exception along with this program;     --
  24 -- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
  25 -- <http://www.gnu.org/licenses/>.                                          --
  26 --                                                                          --
  27 -- GNAT was originally developed  by the GNAT team at  New York University. --
  28 -- Extensive contributions were provided by Ada Core Technologies Inc.      --
  29 --                                                                          --
  30 ------------------------------------------------------------------------------
  31 
  32 package body Ada.Strings.UTF_Encoding.Wide_Strings is
  33    use Interfaces;
  34 
  35    ------------
  36    -- Decode --
  37    ------------
  38 
  39    --  Decode UTF-8/UTF-16BE/UTF-16LE input to Wide_String
  40 
  41    function Decode
  42      (Item         : UTF_String;
  43       Input_Scheme : Encoding_Scheme) return Wide_String
  44    is
  45    begin
  46       if Input_Scheme = UTF_8 then
  47          return Decode (Item);
  48       else
  49          return Decode (To_UTF_16 (Item, Input_Scheme));
  50       end if;
  51    end Decode;
  52 
  53    --  Decode UTF-8 input to Wide_String
  54 
  55    function Decode (Item : UTF_8_String) return Wide_String is
  56       Result : Wide_String (1 .. Item'Length);
  57       --  Result string (worst case is same length as input)
  58 
  59       Len : Natural := 0;
  60       --  Length of result stored so far
  61 
  62       Iptr : Natural;
  63       --  Input Item pointer
  64 
  65       C : Unsigned_8;
  66       R : Unsigned_16;
  67 
  68       procedure Get_Continuation;
  69       --  Reads a continuation byte of the form 10xxxxxx, shifts R left by 6
  70       --  bits, and or's in the xxxxxx to the low order 6 bits. On return Ptr
  71       --  is incremented. Raises exception if continuation byte does not exist
  72       --  or is invalid.
  73 
  74       ----------------------
  75       -- Get_Continuation --
  76       ----------------------
  77 
  78       procedure Get_Continuation is
  79       begin
  80          if Iptr > Item'Last then
  81             Raise_Encoding_Error (Iptr - 1);
  82 
  83          else
  84             C := To_Unsigned_8 (Item (Iptr));
  85             Iptr := Iptr + 1;
  86 
  87             if C not in 2#10_000000# .. 2#10_111111# then
  88                Raise_Encoding_Error (Iptr - 1);
  89             else
  90                R := Shift_Left (R, 6) or Unsigned_16 (C and 2#00_111111#);
  91             end if;
  92          end if;
  93       end Get_Continuation;
  94 
  95    --  Start of processing for Decode
  96 
  97    begin
  98       Iptr := Item'First;
  99 
 100       --  Skip BOM at start
 101 
 102       if Item'Length >= 3
 103         and then Item (Iptr .. Iptr + 2) = BOM_8
 104       then
 105          Iptr := Iptr + 3;
 106 
 107       --  Error if bad BOM
 108 
 109       elsif Item'Length >= 2
 110         and then (Item (Iptr .. Iptr + 1) = BOM_16BE
 111                     or else
 112                   Item (Iptr .. Iptr + 1) = BOM_16LE)
 113       then
 114          Raise_Encoding_Error (Iptr);
 115       end if;
 116 
 117       while Iptr <= Item'Last loop
 118          C := To_Unsigned_8 (Item (Iptr));
 119          Iptr := Iptr + 1;
 120 
 121          --  Codes in the range 16#00# - 16#7F# are represented as
 122          --    0xxxxxxx
 123 
 124          if C <= 16#7F# then
 125             R := Unsigned_16 (C);
 126 
 127          --  No initial code can be of the form 10xxxxxx. Such codes are used
 128          --  only for continuations.
 129 
 130          elsif C <= 2#10_111111# then
 131             Raise_Encoding_Error (Iptr - 1);
 132 
 133          --  Codes in the range 16#80# - 16#7FF# are represented as
 134          --    110yyyxx 10xxxxxx
 135 
 136          elsif C <= 2#110_11111# then
 137             R := Unsigned_16 (C and 2#000_11111#);
 138             Get_Continuation;
 139 
 140          --  Codes in the range 16#800# - 16#FFFF# are represented as
 141          --    1110yyyy 10yyyyxx 10xxxxxx
 142 
 143          elsif C <= 2#1110_1111# then
 144             R := Unsigned_16 (C and 2#0000_1111#);
 145             Get_Continuation;
 146             Get_Continuation;
 147 
 148          --  Codes in the range 16#10000# - 16#10FFFF# are represented as
 149          --    11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
 150 
 151          --  Such codes are out of range for Wide_String output
 152 
 153          else
 154             Raise_Encoding_Error (Iptr - 1);
 155          end if;
 156 
 157          Len := Len + 1;
 158          Result (Len) := Wide_Character'Val (R);
 159       end loop;
 160 
 161       return Result (1 .. Len);
 162    end Decode;
 163 
 164    --  Decode UTF-16 input to Wide_String
 165 
 166    function Decode (Item : UTF_16_Wide_String) return Wide_String is
 167       Result : Wide_String (1 .. Item'Length);
 168       --  Result is same length as input (possibly minus 1 if BOM present)
 169 
 170       Len : Natural := 0;
 171       --  Length of result
 172 
 173       Iptr : Natural;
 174       --  Index of next Item element
 175 
 176       C : Unsigned_16;
 177 
 178    begin
 179       --  Skip UTF-16 BOM at start
 180 
 181       Iptr := Item'First;
 182 
 183       if Item'Length > 0 and then Item (Iptr) = BOM_16 (1) then
 184          Iptr := Iptr + 1;
 185       end if;
 186 
 187       --  Loop through input characters
 188 
 189       while Iptr <= Item'Last loop
 190          C := To_Unsigned_16 (Item (Iptr));
 191          Iptr := Iptr + 1;
 192 
 193          --  Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD#
 194          --  represent their own value.
 195 
 196          if C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then
 197             Len := Len + 1;
 198             Result (Len) := Wide_Character'Val (C);
 199 
 200          --  Codes in the range 16#D800#..16#DBFF# represent the first of the
 201          --  two surrogates used to encode the range 16#01_000#..16#10_FFFF".
 202          --  Such codes are out of range for 16-bit output.
 203 
 204          --  The case of input in the range 16#DC00#..16#DFFF# must never
 205          --  occur, since it means we have a second surrogate character with
 206          --  no corresponding first surrogate.
 207 
 208          --  Codes in the range 16#FFFE# .. 16#FFFF# are also invalid since
 209          --  they conflict with codes used for BOM values.
 210 
 211          --  Thus all remaining codes are invalid
 212 
 213          else
 214             Raise_Encoding_Error (Iptr - 1);
 215          end if;
 216       end loop;
 217 
 218       return Result (1 .. Len);
 219    end Decode;
 220 
 221    ------------
 222    -- Encode --
 223    ------------
 224 
 225    --  Encode Wide_String in UTF-8, UTF-16BE or UTF-16LE
 226 
 227    function Encode
 228      (Item          : Wide_String;
 229       Output_Scheme : Encoding_Scheme;
 230       Output_BOM    : Boolean  := False) return UTF_String
 231    is
 232    begin
 233       --  Case of UTF_8
 234 
 235       if Output_Scheme = UTF_8 then
 236          return Encode (Item, Output_BOM);
 237 
 238       --  Case of UTF_16LE or UTF_16BE, use UTF-16 intermediary
 239 
 240       else
 241          return From_UTF_16 (UTF_16_Wide_String'(Encode (Item)),
 242                              Output_Scheme, Output_BOM);
 243       end if;
 244    end Encode;
 245 
 246    --  Encode Wide_String in UTF-8
 247 
 248    function Encode
 249      (Item       : Wide_String;
 250       Output_BOM : Boolean  := False) return UTF_8_String
 251    is
 252       Result : UTF_8_String (1 .. 3 * Item'Length + 3);
 253       --  Worst case is three bytes per input byte + space for BOM
 254 
 255       Len : Natural;
 256       --  Number of output codes stored in Result
 257 
 258       C : Unsigned_16;
 259       --  Single input character
 260 
 261       procedure Store (C : Unsigned_16);
 262       pragma Inline (Store);
 263       --  Store one output code, C is in the range 0 .. 255
 264 
 265       -----------
 266       -- Store --
 267       -----------
 268 
 269       procedure Store (C : Unsigned_16) is
 270       begin
 271          Len := Len + 1;
 272          Result (Len) := Character'Val (C);
 273       end Store;
 274 
 275    --  Start of processing for UTF8_Encode
 276 
 277    begin
 278       --  Output BOM if required
 279 
 280       if Output_BOM then
 281          Result (1 .. 3) := BOM_8;
 282          Len := 3;
 283       else
 284          Len := 0;
 285       end if;
 286 
 287       --  Loop through characters of input
 288 
 289       for J in Item'Range loop
 290          C := To_Unsigned_16 (Item (J));
 291 
 292          --  Codes in the range 16#00# - 16#7F# are represented as
 293          --    0xxxxxxx
 294 
 295          if C <= 16#7F# then
 296             Store (C);
 297 
 298          --  Codes in the range 16#80# - 16#7FF# are represented as
 299          --    110yyyxx 10xxxxxx
 300 
 301          elsif C <= 16#7FF# then
 302             Store (2#110_00000# or Shift_Right (C, 6));
 303             Store (2#10_000000# or (C and 2#00_111111#));
 304 
 305          --  Codes in the range 16#800# - 16#FFFF# are represented as
 306          --    1110yyyy 10yyyyxx 10xxxxxx
 307 
 308          else
 309             Store (2#1110_0000# or Shift_Right (C, 12));
 310             Store (2#10_000000# or
 311                      Shift_Right (C and 2#111111_000000#, 6));
 312             Store (2#10_000000# or (C and 2#00_111111#));
 313          end if;
 314       end loop;
 315 
 316       return Result (1 .. Len);
 317    end Encode;
 318 
 319    --  Encode Wide_String in UTF-16
 320 
 321    function Encode
 322      (Item       : Wide_String;
 323       Output_BOM : Boolean  := False) return UTF_16_Wide_String
 324    is
 325       Result : UTF_16_Wide_String
 326                  (1 .. Item'Length + Boolean'Pos (Output_BOM));
 327       --  Output is same length as input + possible BOM
 328 
 329       Len : Integer;
 330       --  Length of output string
 331 
 332       C : Unsigned_16;
 333 
 334    begin
 335       --  Output BOM if required
 336 
 337       if Output_BOM then
 338          Result (1) := BOM_16 (1);
 339          Len := 1;
 340       else
 341          Len := 0;
 342       end if;
 343 
 344       --  Loop through input characters encoding them
 345 
 346       for Iptr in Item'Range loop
 347          C := To_Unsigned_16 (Item (Iptr));
 348 
 349          --  Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD# are
 350          --  output unchanged.
 351 
 352          if C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then
 353             Len := Len + 1;
 354             Result (Len) := Wide_Character'Val (C);
 355 
 356          --  Codes in the range 16#D800#..16#DFFF# should never appear in the
 357          --  input, since no valid Unicode characters are in this range (which
 358          --  would conflict with the UTF-16 surrogate encodings). Similarly
 359          --  codes in the range 16#FFFE#..16#FFFF conflict with BOM codes.
 360          --  Thus all remaining codes are illegal.
 361 
 362          else
 363             Raise_Encoding_Error (Iptr);
 364          end if;
 365       end loop;
 366 
 367       return Result;
 368    end Encode;
 369 
 370 end Ada.Strings.UTF_Encoding.Wide_Strings;