File : a-suenst.adb


   1 ------------------------------------------------------------------------------
   2 --                                                                          --
   3 --                         GNAT RUN-TIME COMPONENTS                         --
   4 --                                                                          --
   5 --                     ADA.STRINGS.UTF_ENCODING.STRINGS                     --
   6 --                                                                          --
   7 --                                 B o d y                                  --
   8 --                                                                          --
   9 --          Copyright (C) 2010-2013, Free Software Foundation, Inc.         --
  10 --                                                                          --
  11 -- GNAT is free software;  you can  redistribute it  and/or modify it under --
  12 -- terms of the  GNU General Public License as published  by the Free Soft- --
  13 -- ware  Foundation;  either version 3,  or (at your option) any later ver- --
  14 -- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
  15 -- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
  16 -- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
  17 --                                                                          --
  18 --                                                                          --
  19 --                                                                          --
  20 --                                                                          --
  21 --                                                                          --
  22 -- You should have received a copy of the GNU General Public License and    --
  23 -- a copy of the GCC Runtime Library Exception along with this program;     --
  24 -- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
  25 -- <http://www.gnu.org/licenses/>.                                          --
  26 --                                                                          --
  27 -- GNAT was originally developed  by the GNAT team at  New York University. --
  28 -- Extensive contributions were provided by Ada Core Technologies Inc.      --
  29 --                                                                          --
  30 ------------------------------------------------------------------------------
  31 
  32 package body Ada.Strings.UTF_Encoding.Strings is
  33    use Interfaces;
  34 
  35    ------------
  36    -- Decode --
  37    ------------
  38 
  39    --  Decode UTF-8/UTF-16BE/UTF-16LE input to String
  40 
  41    function Decode
  42      (Item         : UTF_String;
  43       Input_Scheme : Encoding_Scheme) return String
  44    is
  45    begin
  46       if Input_Scheme = UTF_8 then
  47          return Decode (Item);
  48       else
  49          return Decode (To_UTF_16 (Item, Input_Scheme));
  50       end if;
  51    end Decode;
  52 
  53    --  Decode UTF-8 input to String
  54 
  55    function Decode (Item : UTF_8_String) return String is
  56       Result : String (1 .. Item'Length);
  57       --  Result string (worst case is same length as input)
  58 
  59       Len : Natural := 0;
  60       --  Length of result stored so far
  61 
  62       Iptr : Natural;
  63       --  Input Item pointer
  64 
  65       C : Unsigned_8;
  66       R : Unsigned_16;
  67 
  68       procedure Get_Continuation;
  69       --  Reads a continuation byte of the form 10xxxxxx, shifts R left
  70       --  by 6 bits, and or's in the xxxxxx to the low order 6 bits. On
  71       --  return Ptr is incremented. Raises exception if continuation
  72       --  byte does not exist or is invalid.
  73 
  74       ----------------------
  75       -- Get_Continuation --
  76       ----------------------
  77 
  78       procedure Get_Continuation is
  79       begin
  80          if Iptr > Item'Last then
  81             Raise_Encoding_Error (Iptr - 1);
  82 
  83          else
  84             C := To_Unsigned_8 (Item (Iptr));
  85             Iptr := Iptr + 1;
  86 
  87             if C not in 2#10_000000# .. 2#10_111111# then
  88                Raise_Encoding_Error (Iptr - 1);
  89             else
  90                R := Shift_Left (R, 6) or Unsigned_16 (C and 2#00_111111#);
  91             end if;
  92          end if;
  93       end Get_Continuation;
  94 
  95    --  Start of processing for Decode
  96 
  97    begin
  98       Iptr := Item'First;
  99 
 100       --  Skip BOM at start
 101 
 102       if Item'Length >= 3
 103         and then Item (Iptr .. Iptr + 2) = BOM_8
 104       then
 105          Iptr := Iptr + 3;
 106 
 107       --  Error if bad BOM
 108 
 109       elsif Item'Length >= 2
 110         and then (Item (Iptr .. Iptr + 1) = BOM_16BE
 111                     or else
 112                   Item (Iptr .. Iptr + 1) = BOM_16LE)
 113       then
 114          Raise_Encoding_Error (Iptr);
 115       end if;
 116 
 117       while Iptr <= Item'Last loop
 118          C := To_Unsigned_8 (Item (Iptr));
 119          Iptr := Iptr + 1;
 120 
 121          --  Codes in the range 16#00# - 16#7F# are represented as
 122          --    0xxxxxxx
 123 
 124          if C <= 16#7F# then
 125             R := Unsigned_16 (C);
 126 
 127          --  No initial code can be of the form 10xxxxxx. Such codes are used
 128          --  only for continuations.
 129 
 130          elsif C <= 2#10_111111# then
 131             Raise_Encoding_Error (Iptr - 1);
 132 
 133          --  Codes in the range 16#80# - 16#7FF# are represented as
 134          --    110yyyxx 10xxxxxx
 135 
 136          elsif C <= 2#110_11111# then
 137             R := Unsigned_16 (C and 2#000_11111#);
 138             Get_Continuation;
 139 
 140          --  Codes in the range 16#800# - 16#FFFF# are represented as
 141          --    1110yyyy 10yyyyxx 10xxxxxx
 142 
 143          --  Such codes are out of range for type Character
 144 
 145          --  Codes in the range 16#10000# - 16#10FFFF# are represented as
 146          --    11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
 147 
 148          --  Such codes are out of range for Wide_String output
 149 
 150          --  Thus all remaining cases raise Encoding_Error
 151 
 152          else
 153             Raise_Encoding_Error (Iptr - 1);
 154          end if;
 155 
 156          Len := Len + 1;
 157 
 158          --  The value may still be out of range of Standard.Character. We make
 159          --  the check explicit because the library is typically compiled with
 160          --  range checks disabled.
 161 
 162          if R > Character'Pos (Character'Last) then
 163             Raise_Encoding_Error (Iptr - 1);
 164          end if;
 165 
 166          Result (Len) := Character'Val (R);
 167       end loop;
 168 
 169       return Result (1 .. Len);
 170    end Decode;
 171 
 172    --  Decode UTF-16 input to String
 173 
 174    function Decode (Item : UTF_16_Wide_String) return String is
 175       Result : String (1 .. Item'Length);
 176       --  Result is same length as input (possibly minus 1 if BOM present)
 177 
 178       Len : Natural := 0;
 179       --  Length of result
 180 
 181       Iptr : Natural;
 182       --  Index of next Item element
 183 
 184       C : Unsigned_16;
 185 
 186    begin
 187       --  Skip UTF-16 BOM at start
 188 
 189       Iptr := Item'First;
 190 
 191       if Item'Length > 0 and then Item (Iptr) = BOM_16 (1) then
 192          Iptr := Iptr + 1;
 193       end if;
 194 
 195       --  Loop through input characters
 196 
 197       while Iptr <= Item'Last loop
 198          C := To_Unsigned_16 (Item (Iptr));
 199          Iptr := Iptr + 1;
 200 
 201          --  Codes in the range 16#0000#..16#00FF# represent their own value
 202 
 203          if C <= 16#00FF# then
 204             Len := Len + 1;
 205             Result (Len) := Character'Val (C);
 206 
 207          --  All other codes are invalid, either they are invalid UTF-16
 208          --  encoding sequences, or they represent values that are out of
 209          --  range for type Character.
 210 
 211          else
 212             Raise_Encoding_Error (Iptr - 1);
 213          end if;
 214       end loop;
 215 
 216       return Result (1 .. Len);
 217    end Decode;
 218 
 219    ------------
 220    -- Encode --
 221    ------------
 222 
 223    --  Encode String in UTF-8, UTF-16BE or UTF-16LE
 224 
 225    function Encode
 226      (Item          : String;
 227       Output_Scheme : Encoding_Scheme;
 228       Output_BOM    : Boolean  := False) return UTF_String
 229    is
 230    begin
 231       --  Case of UTF_8
 232 
 233       if Output_Scheme = UTF_8 then
 234          return Encode (Item, Output_BOM);
 235 
 236       --  Case of UTF_16LE or UTF_16BE, use UTF-16 intermediary
 237 
 238       else
 239          return From_UTF_16 (UTF_16_Wide_String'(Encode (Item)),
 240                              Output_Scheme, Output_BOM);
 241       end if;
 242    end Encode;
 243 
 244    --  Encode String in UTF-8
 245 
 246    function Encode
 247      (Item       : String;
 248       Output_BOM : Boolean  := False) return UTF_8_String
 249    is
 250       Result : UTF_8_String (1 .. 3 * Item'Length + 3);
 251       --  Worst case is three bytes per input byte + space for BOM
 252 
 253       Len : Natural;
 254       --  Number of output codes stored in Result
 255 
 256       C : Unsigned_8;
 257       --  Single input character
 258 
 259       procedure Store (C : Unsigned_8);
 260       pragma Inline (Store);
 261       --  Store one output code, C is in the range 0 .. 255
 262 
 263       -----------
 264       -- Store --
 265       -----------
 266 
 267       procedure Store (C : Unsigned_8) is
 268       begin
 269          Len := Len + 1;
 270          Result (Len) := Character'Val (C);
 271       end Store;
 272 
 273    --  Start of processing for UTF8_Encode
 274 
 275    begin
 276       --  Output BOM if required
 277 
 278       if Output_BOM then
 279          Result (1 .. 3) := BOM_8;
 280          Len := 3;
 281       else
 282          Len := 0;
 283       end if;
 284 
 285       --  Loop through characters of input
 286 
 287       for J in Item'Range loop
 288          C := To_Unsigned_8 (Item (J));
 289 
 290          --  Codes in the range 16#00# - 16#7F# are represented as
 291          --    0xxxxxxx
 292 
 293          if C <= 16#7F# then
 294             Store (C);
 295 
 296          --  Codes in the range 16#80# - 16#7FF# are represented as
 297          --    110yyyxx 10xxxxxx
 298 
 299          --  For type character of course, the limit is 16#FF# in any case
 300 
 301          else
 302             Store (2#110_00000# or Shift_Right (C, 6));
 303             Store (2#10_000000# or (C and 2#00_111111#));
 304          end if;
 305       end loop;
 306 
 307       return Result (1 .. Len);
 308    end Encode;
 309 
 310    --  Encode String in UTF-16
 311 
 312    function Encode
 313      (Item       : String;
 314       Output_BOM : Boolean  := False) return UTF_16_Wide_String
 315    is
 316       Result : UTF_16_Wide_String
 317                  (1 .. Item'Length + Boolean'Pos (Output_BOM));
 318       --  Output is same length as input + possible BOM
 319 
 320       Len : Integer;
 321       --  Length of output string
 322 
 323       C : Unsigned_8;
 324 
 325    begin
 326       --  Output BOM if required
 327 
 328       if Output_BOM then
 329          Result (1) := BOM_16 (1);
 330          Len := 1;
 331       else
 332          Len := 0;
 333       end if;
 334 
 335       --  Loop through input characters encoding them
 336 
 337       for Iptr in Item'Range loop
 338          C := To_Unsigned_8 (Item (Iptr));
 339 
 340          --  Codes in the range 16#0000#..16#00FF# are output unchanged. This
 341          --  includes all possible cases of Character values.
 342 
 343          Len := Len + 1;
 344          Result (Len) := Wide_Character'Val (C);
 345       end loop;
 346 
 347       return Result;
 348    end Encode;
 349 
 350 end Ada.Strings.UTF_Encoding.Strings;