File : a-suezst.adb


   1 ------------------------------------------------------------------------------
   2 --                                                                          --
   3 --                         GNAT RUN-TIME COMPONENTS                         --
   4 --                                                                          --
   5 --                ADA.STRINGS.UTF_ENCODING.WIDE_WIDE_STRINGS                --
   6 --                                                                          --
   7 --                                 B o d y                                  --
   8 --                                                                          --
   9 --          Copyright (C) 2010-2011, Free Software Foundation, Inc.         --
  10 --                                                                          --
  11 -- GNAT is free software;  you can  redistribute it  and/or modify it under --
  12 -- terms of the  GNU General Public License as published  by the Free Soft- --
  13 -- ware  Foundation;  either version 3,  or (at your option) any later ver- --
  14 -- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
  15 -- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
  16 -- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
  17 --                                                                          --
  18 --                                                                          --
  19 --                                                                          --
  20 --                                                                          --
  21 --                                                                          --
  22 -- You should have received a copy of the GNU General Public License and    --
  23 -- a copy of the GCC Runtime Library Exception along with this program;     --
  24 -- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
  25 -- <http://www.gnu.org/licenses/>.                                          --
  26 --                                                                          --
  27 -- GNAT was originally developed  by the GNAT team at  New York University. --
  28 -- Extensive contributions were provided by Ada Core Technologies Inc.      --
  29 --                                                                          --
  30 ------------------------------------------------------------------------------
  31 
  32 package body Ada.Strings.UTF_Encoding.Wide_Wide_Strings is
  33    use Interfaces;
  34 
  35    ------------
  36    -- Decode --
  37    ------------
  38 
  39    --  Decode UTF-8/UTF-16BE/UTF-16LE input to Wide_Wide_String
  40 
  41    function Decode
  42      (Item         : UTF_String;
  43       Input_Scheme : Encoding_Scheme) return Wide_Wide_String
  44    is
  45    begin
  46       if Input_Scheme = UTF_8 then
  47          return Decode (Item);
  48       else
  49          return Decode (To_UTF_16 (Item, Input_Scheme));
  50       end if;
  51    end Decode;
  52 
  53    --  Decode UTF-8 input to Wide_Wide_String
  54 
  55    function Decode (Item : UTF_8_String) return Wide_Wide_String is
  56       Result : Wide_Wide_String (1 .. Item'Length);
  57       --  Result string (worst case is same length as input)
  58 
  59       Len : Natural := 0;
  60       --  Length of result stored so far
  61 
  62       Iptr : Natural;
  63       --  Input string pointer
  64 
  65       C : Unsigned_8;
  66       R : Unsigned_32;
  67 
  68       procedure Get_Continuation;
  69       --  Reads a continuation byte of the form 10xxxxxx, shifts R left by 6
  70       --  bits, and or's in the xxxxxx to the low order 6 bits. On return Ptr
  71       --  is incremented. Raises exception if continuation byte does not exist
  72       --  or is invalid.
  73 
  74       ----------------------
  75       -- Get_Continuation --
  76       ----------------------
  77 
  78       procedure Get_Continuation is
  79       begin
  80          if Iptr > Item'Last then
  81             Raise_Encoding_Error (Iptr - 1);
  82 
  83          else
  84             C := To_Unsigned_8 (Item (Iptr));
  85             Iptr := Iptr + 1;
  86 
  87             if C not in 2#10_000000# .. 2#10_111111# then
  88                Raise_Encoding_Error (Iptr - 1);
  89             else
  90                R := Shift_Left (R, 6) or Unsigned_32 (C and 2#00_111111#);
  91             end if;
  92          end if;
  93       end Get_Continuation;
  94 
  95    --  Start of processing for Decode
  96 
  97    begin
  98       Iptr := Item'First;
  99 
 100       --  Skip BOM at start
 101 
 102       if Item'Length >= 3
 103         and then Item (Iptr .. Iptr + 2) = BOM_8
 104       then
 105          Iptr := Iptr + 3;
 106 
 107       --  Error if bad BOM
 108 
 109       elsif Item'Length >= 2
 110         and then (Item (Iptr .. Iptr + 1) = BOM_16BE
 111                     or else
 112                   Item (Iptr .. Iptr + 1) = BOM_16LE)
 113       then
 114          Raise_Encoding_Error (Iptr);
 115       end if;
 116 
 117       --  Loop through input characters
 118 
 119       while Iptr <= Item'Last loop
 120          C := To_Unsigned_8 (Item (Iptr));
 121          Iptr := Iptr + 1;
 122 
 123          --  Codes in the range 16#00# - 16#7F# are represented as
 124          --    0xxxxxxx
 125 
 126          if C <= 16#7F# then
 127             R := Unsigned_32 (C);
 128 
 129          --  No initial code can be of the form 10xxxxxx. Such codes are used
 130          --  only for continuations.
 131 
 132          elsif C <= 2#10_111111# then
 133             Raise_Encoding_Error (Iptr - 1);
 134 
 135          --  Codes in the range 16#80# - 16#7FF# are represented as
 136          --    110yyyxx 10xxxxxx
 137 
 138          elsif C <= 2#110_11111# then
 139             R := Unsigned_32 (C and 2#000_11111#);
 140             Get_Continuation;
 141 
 142          --  Codes in the range 16#800# - 16#FFFF# are represented as
 143          --    1110yyyy 10yyyyxx 10xxxxxx
 144 
 145          elsif C <= 2#1110_1111# then
 146             R := Unsigned_32 (C and 2#0000_1111#);
 147             Get_Continuation;
 148             Get_Continuation;
 149 
 150          --  Codes in the range 16#10000# - 16#10FFFF# are represented as
 151          --    11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
 152 
 153          elsif C <= 2#11110_111# then
 154             R := Unsigned_32 (C and 2#00000_111#);
 155             Get_Continuation;
 156             Get_Continuation;
 157             Get_Continuation;
 158 
 159          --  Any other code is an error
 160 
 161          else
 162             Raise_Encoding_Error (Iptr - 1);
 163          end if;
 164 
 165          Len := Len + 1;
 166          Result (Len) := Wide_Wide_Character'Val (R);
 167       end loop;
 168 
 169       return Result (1 .. Len);
 170    end Decode;
 171 
 172    --  Decode UTF-16 input to Wide_Wide_String
 173 
 174    function Decode (Item : UTF_16_Wide_String) return Wide_Wide_String is
 175       Result : Wide_Wide_String (1 .. Item'Length);
 176       --  Result cannot be longer than the input string
 177 
 178       Len : Natural := 0;
 179       --  Length of result
 180 
 181       Iptr : Natural;
 182       --  Pointer to next element in Item
 183 
 184       C : Unsigned_16;
 185       R : Unsigned_32;
 186 
 187    begin
 188       --  Skip UTF-16 BOM at start
 189 
 190       Iptr := Item'First;
 191 
 192       if Iptr <= Item'Last and then Item (Iptr) = BOM_16 (1) then
 193          Iptr := Iptr + 1;
 194       end if;
 195 
 196       --  Loop through input characters
 197 
 198       while Iptr <= Item'Last loop
 199          C := To_Unsigned_16 (Item (Iptr));
 200          Iptr := Iptr + 1;
 201 
 202          --  Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD#
 203          --  represent their own value.
 204 
 205          if C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then
 206             Len := Len + 1;
 207             Result (Len) := Wide_Wide_Character'Val (C);
 208 
 209          --  Codes in the range 16#D800#..16#DBFF# represent the first of the
 210          --  two surrogates used to encode the range 16#01_000#..16#10_FFFF".
 211          --  The first surrogate provides 10 high order bits of the result.
 212 
 213          elsif C <= 16#DBFF# then
 214             R := Shift_Left ((Unsigned_32 (C) - 16#D800#), 10);
 215 
 216             --  Error if at end of string
 217 
 218             if Iptr > Item'Last then
 219                Raise_Encoding_Error (Iptr - 1);
 220 
 221             --  Otherwise next character must be valid low order surrogate
 222             --  which provides the low 10 order bits of the result.
 223 
 224             else
 225                C := To_Unsigned_16 (Item (Iptr));
 226                Iptr := Iptr + 1;
 227 
 228                if C not in 16#DC00# .. 16#DFFF# then
 229                   Raise_Encoding_Error (Iptr - 1);
 230 
 231                else
 232                   R := R or (Unsigned_32 (C) mod 2 ** 10);
 233 
 234                --  The final adjustment is to add 16#01_0000 to get the
 235                --  result back in the required 21 bit range.
 236 
 237                   R := R + 16#01_0000#;
 238                   Len := Len + 1;
 239                   Result (Len) := Wide_Wide_Character'Val (R);
 240                end if;
 241             end if;
 242 
 243          --  Remaining codes are invalid
 244 
 245          else
 246             Raise_Encoding_Error (Iptr - 1);
 247          end if;
 248       end loop;
 249 
 250       return Result (1 .. Len);
 251    end Decode;
 252 
 253    ------------
 254    -- Encode --
 255    ------------
 256 
 257    --  Encode Wide_Wide_String in UTF-8, UTF-16BE or UTF-16LE
 258 
 259    function Encode
 260      (Item          : Wide_Wide_String;
 261       Output_Scheme : Encoding_Scheme;
 262       Output_BOM    : Boolean  := False) return UTF_String
 263    is
 264    begin
 265       if Output_Scheme = UTF_8 then
 266          return Encode (Item, Output_BOM);
 267       else
 268          return From_UTF_16 (Encode (Item), Output_Scheme, Output_BOM);
 269       end if;
 270    end Encode;
 271 
 272    --  Encode Wide_Wide_String in UTF-8
 273 
 274    function Encode
 275      (Item       : Wide_Wide_String;
 276       Output_BOM : Boolean  := False) return UTF_8_String
 277    is
 278       Result : String (1 .. 4 * Item'Length + 3);
 279       --  Worst case is four bytes per input byte + space for BOM
 280 
 281       Len  : Natural;
 282       --  Number of output codes stored in Result
 283 
 284       C : Unsigned_32;
 285       --  Single input character
 286 
 287       procedure Store (C : Unsigned_32);
 288       pragma Inline (Store);
 289       --  Store one output code (input is in range 0 .. 255)
 290 
 291       -----------
 292       -- Store --
 293       -----------
 294 
 295       procedure Store (C : Unsigned_32) is
 296       begin
 297          Len := Len + 1;
 298          Result (Len) := Character'Val (C);
 299       end Store;
 300 
 301    --  Start of processing for Encode
 302 
 303    begin
 304       --  Output BOM if required
 305 
 306       if Output_BOM then
 307          Result (1 .. 3) := BOM_8;
 308          Len := 3;
 309       else
 310          Len := 0;
 311       end if;
 312 
 313       --  Loop through characters of input
 314 
 315       for Iptr in Item'Range loop
 316          C := To_Unsigned_32 (Item (Iptr));
 317 
 318          --  Codes in the range 16#00#..16#7F# are represented as
 319          --    0xxxxxxx
 320 
 321          if C <= 16#7F# then
 322             Store (C);
 323 
 324          --  Codes in the range 16#80#..16#7FF# are represented as
 325          --    110yyyxx 10xxxxxx
 326 
 327          elsif C <= 16#7FF# then
 328             Store (2#110_00000# or Shift_Right (C, 6));
 329             Store (2#10_000000# or (C and 2#00_111111#));
 330 
 331          --  Codes in the range 16#800#..16#D7FF# or 16#E000#..16#FFFD# are
 332          --  represented as
 333          --    1110yyyy 10yyyyxx 10xxxxxx
 334 
 335          elsif C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then
 336             Store (2#1110_0000# or Shift_Right (C, 12));
 337             Store (2#10_000000# or
 338                      Shift_Right (C and 2#111111_000000#, 6));
 339             Store (2#10_000000# or (C and 2#00_111111#));
 340 
 341          --  Codes in the range 16#10000# - 16#10FFFF# are represented as
 342          --    11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
 343 
 344          elsif C in 16#1_0000# .. 16#10_FFFF# then
 345             Store (2#11110_000# or
 346                      Shift_Right (C, 18));
 347             Store (2#10_000000# or
 348                      Shift_Right (C and 2#111111_000000_000000#, 12));
 349             Store (2#10_000000# or
 350                      Shift_Right (C and 2#111111_000000#, 6));
 351             Store (2#10_000000# or
 352                      (C and 2#00_111111#));
 353 
 354          --  All other codes are invalid
 355 
 356          else
 357             Raise_Encoding_Error (Iptr);
 358          end if;
 359       end loop;
 360 
 361       return Result (1 .. Len);
 362    end Encode;
 363 
 364    --  Encode Wide_Wide_String in UTF-16
 365 
 366    function Encode
 367      (Item       : Wide_Wide_String;
 368       Output_BOM : Boolean  := False) return UTF_16_Wide_String
 369    is
 370       Result : UTF_16_Wide_String (1 .. 2 * Item'Length + 1);
 371       --  Worst case is each input character generates two output characters
 372       --  plus one for possible BOM.
 373 
 374       Len : Integer;
 375       --  Length of output string
 376 
 377       C : Unsigned_32;
 378 
 379    begin
 380       --  Output BOM if needed
 381 
 382       if Output_BOM then
 383          Result (1) := BOM_16 (1);
 384          Len := 1;
 385       else
 386          Len := 0;
 387       end if;
 388 
 389       --  Loop through input characters encoding them
 390 
 391       for Iptr in Item'Range loop
 392          C := To_Unsigned_32 (Item (Iptr));
 393 
 394          --  Codes in the range 16#00_0000#..16#00_D7FF# or 16#E000#..16#FFFD#
 395          --  are output unchanged
 396 
 397          if C <= 16#00_D7FF# or else C in 16#E000# .. 16#FFFD# then
 398             Len := Len + 1;
 399             Result (Len) := Wide_Character'Val (C);
 400 
 401          --  Codes in the range 16#01_0000#..16#10_FFFF# are output using two
 402          --  surrogate characters. First 16#1_0000# is subtracted from the code
 403          --  point to give a 20-bit value. This is then split into two separate
 404          --  10-bit values each of which is represented as a surrogate with the
 405          --  most significant half placed in the first surrogate. The ranges of
 406          --  values used for the two surrogates are 16#D800#-16#DBFF# for the
 407          --  first, most significant surrogate and 16#DC00#-16#DFFF# for the
 408          --  second, least significant surrogate.
 409 
 410          elsif C in 16#1_0000# ..  16#10_FFFF# then
 411             C := C - 16#1_0000#;
 412 
 413             Len := Len + 1;
 414             Result (Len) := Wide_Character'Val (16#D800# + C / 2 ** 10);
 415 
 416             Len := Len + 1;
 417             Result (Len) := Wide_Character'Val (16#DC00# + C mod 2 ** 10);
 418 
 419          --  All other codes are invalid
 420 
 421          else
 422             Raise_Encoding_Error (Iptr);
 423          end if;
 424       end loop;
 425 
 426       return Result (1 .. Len);
 427    end Encode;
 428 
 429 end Ada.Strings.UTF_Encoding.Wide_Wide_Strings;