codecvt_specializations.h

Go to the documentation of this file.
00001 // Locale support (codecvt) -*- C++ -*-
00002 
00003 // Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007
00004 //  Free Software Foundation, Inc.
00005 //
00006 // This file is part of the GNU ISO C++ Library.  This library is free
00007 // software; you can redistribute it and/or modify it under the
00008 // terms of the GNU General Public License as published by the
00009 // Free Software Foundation; either version 2, or (at your option)
00010 // any later version.
00011 
00012 // This library is distributed in the hope that it will be useful,
00013 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00014 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015 // GNU General Public License for more details.
00016 
00017 // You should have received a copy of the GNU General Public License along
00018 // with this library; see the file COPYING.  If not, write to the Free
00019 // Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
00020 // USA.
00021 
00022 // As a special exception, you may use this file as part of a free software
00023 // library without restriction.  Specifically, if other files instantiate
00024 // templates or use macros or inline functions from this file, or you compile
00025 // this file and link it with other files to produce an executable, this
00026 // file does not by itself cause the resulting executable to be covered by
00027 // the GNU General Public License.  This exception does not however
00028 // invalidate any other reasons why the executable file might be covered by
00029 // the GNU General Public License.
00030 
00031 //
00032 // ISO C++ 14882: 22.2.1.5 Template class codecvt
00033 //
00034 
00035 // Written by Benjamin Kosnik <bkoz@redhat.com>
00036 
00037 /** @file ext/codecvt_specializations.h
00038  *  This file is a GNU extension to the Standard C++ Library.
00039  */
00040 
00041 #ifndef _EXT_CODECVT_SPECIALIZATIONS_H
00042 #define _EXT_CODECVT_SPECIALIZATIONS_H 1
00043 
00044 #include <bits/c++config.h>
00045 #include <locale>
00046 #include <iconv.h>
00047 
00048 _GLIBCXX_BEGIN_NAMESPACE(__gnu_cxx)
00049 
00050   /// Extension to use iconv for dealing with character encodings.
00051   // This includes conversions and comparisons between various character
00052   // sets.  This object encapsulates data that may need to be shared between
00053   // char_traits, codecvt and ctype.
00054   class encoding_state
00055   {
00056   public:
00057     // Types: 
00058     // NB: A conversion descriptor subsumes and enhances the
00059     // functionality of a simple state type such as mbstate_t.
00060     typedef iconv_t descriptor_type;
00061     
00062   protected:
00063     // Name of internal character set encoding.
00064     std::string         _M_int_enc;
00065 
00066     // Name of external character set encoding.
00067     std::string     _M_ext_enc;
00068 
00069     // Conversion descriptor between external encoding to internal encoding.
00070     descriptor_type _M_in_desc;
00071 
00072     // Conversion descriptor between internal encoding to external encoding.
00073     descriptor_type _M_out_desc;
00074 
00075     // The byte-order marker for the external encoding, if necessary.
00076     int         _M_ext_bom;
00077 
00078     // The byte-order marker for the internal encoding, if necessary.
00079     int         _M_int_bom;
00080 
00081     // Number of external bytes needed to construct one complete
00082     // character in the internal encoding.
00083     // NB: -1 indicates variable, or stateful, encodings.
00084     int         _M_bytes;
00085 
00086   public:
00087     explicit 
00088     encoding_state() 
00089     : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
00090     { }
00091 
00092     explicit 
00093     encoding_state(const char* __int, const char* __ext, 
00094            int __ibom = 0, int __ebom = 0, int __bytes = 1)
00095     : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0), 
00096       _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
00097     { init(); }
00098 
00099     // 21.1.2 traits typedefs
00100     // p4
00101     // typedef STATE_T state_type
00102     // requires: state_type shall meet the requirements of
00103     // CopyConstructible types (20.1.3)
00104     // NB: This does not preserve the actual state of the conversion
00105     // descriptor member, but it does duplicate the encoding
00106     // information.
00107     encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
00108     { construct(__obj); }
00109 
00110     // Need assignment operator as well.
00111     encoding_state&
00112     operator=(const encoding_state& __obj)
00113     {
00114       construct(__obj);
00115       return *this;
00116     }
00117 
00118     ~encoding_state()
00119     { destroy(); } 
00120 
00121     bool
00122     good() const throw()
00123     { 
00124       const descriptor_type __err = (iconv_t)(-1);
00125       bool __test = _M_in_desc && _M_in_desc != __err; 
00126       __test &=  _M_out_desc && _M_out_desc != __err;
00127       return __test;
00128     }
00129     
00130     int
00131     character_ratio() const
00132     { return _M_bytes; }
00133 
00134     const std::string
00135     internal_encoding() const
00136     { return _M_int_enc; }
00137 
00138     int 
00139     internal_bom() const
00140     { return _M_int_bom; }
00141 
00142     const std::string
00143     external_encoding() const
00144     { return _M_ext_enc; }
00145 
00146     int 
00147     external_bom() const
00148     { return _M_ext_bom; }
00149 
00150     const descriptor_type&
00151     in_descriptor() const
00152     { return _M_in_desc; }
00153 
00154     const descriptor_type&
00155     out_descriptor() const
00156     { return _M_out_desc; }
00157 
00158   protected:
00159     void
00160     init()
00161     {
00162       const descriptor_type __err = (iconv_t)(-1);
00163       const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
00164       if (!_M_in_desc && __have_encodings)
00165     {
00166       _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
00167       if (_M_in_desc == __err)
00168         std::__throw_runtime_error(__N("encoding_state::_M_init "
00169                     "creating iconv input descriptor failed"));
00170     }
00171       if (!_M_out_desc && __have_encodings)
00172     {
00173       _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
00174       if (_M_out_desc == __err)
00175         std::__throw_runtime_error(__N("encoding_state::_M_init "
00176                   "creating iconv output descriptor failed"));
00177     }
00178     }
00179 
00180     void
00181     construct(const encoding_state& __obj)
00182     {
00183       destroy();
00184       _M_int_enc = __obj._M_int_enc;
00185       _M_ext_enc = __obj._M_ext_enc;
00186       _M_ext_bom = __obj._M_ext_bom;
00187       _M_int_bom = __obj._M_int_bom;
00188       _M_bytes = __obj._M_bytes;
00189       init();
00190     }
00191 
00192     void
00193     destroy() throw()
00194     {
00195       const descriptor_type __err = (iconv_t)(-1);
00196       if (_M_in_desc && _M_in_desc != __err) 
00197     {
00198       iconv_close(_M_in_desc);
00199       _M_in_desc = 0;
00200     }
00201       if (_M_out_desc && _M_out_desc != __err) 
00202     {
00203       iconv_close(_M_out_desc);
00204       _M_out_desc = 0;
00205     }
00206     }
00207   };
00208 
00209   /// encoding_char_traits
00210   // Custom traits type with encoding_state for the state type, and the
00211   // associated fpos<encoding_state> for the position type, all other
00212   // bits equivalent to the required char_traits instantiations.
00213   template<typename _CharT>
00214     struct encoding_char_traits : public std::char_traits<_CharT>
00215     {
00216       typedef encoding_state                state_type;
00217       typedef typename std::fpos<state_type>        pos_type;
00218     };
00219 
00220 _GLIBCXX_END_NAMESPACE
00221 
00222 
00223 _GLIBCXX_BEGIN_NAMESPACE(std)
00224 
00225   using __gnu_cxx::encoding_state;
00226 
00227   /// codecvt<InternT, _ExternT, encoding_state> specialization.
00228   // This partial specialization takes advantage of iconv to provide
00229   // code conversions between a large number of character encodings.
00230   template<typename _InternT, typename _ExternT>
00231     class codecvt<_InternT, _ExternT, encoding_state>
00232     : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
00233     {
00234     public:      
00235       // Types:
00236       typedef codecvt_base::result          result;
00237       typedef _InternT                  intern_type;
00238       typedef _ExternT                  extern_type;
00239       typedef __gnu_cxx::encoding_state         state_type;
00240       typedef state_type::descriptor_type       descriptor_type;
00241 
00242       // Data Members:
00243       static locale::id         id;
00244 
00245       explicit 
00246       codecvt(size_t __refs = 0)
00247       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
00248       { }
00249 
00250       explicit 
00251       codecvt(state_type& __enc, size_t __refs = 0)
00252       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
00253       { }
00254 
00255      protected:
00256       virtual 
00257       ~codecvt() { }
00258 
00259       virtual result
00260       do_out(state_type& __state, const intern_type* __from, 
00261          const intern_type* __from_end, const intern_type*& __from_next,
00262          extern_type* __to, extern_type* __to_end,
00263          extern_type*& __to_next) const;
00264 
00265       virtual result
00266       do_unshift(state_type& __state, extern_type* __to, 
00267          extern_type* __to_end, extern_type*& __to_next) const;
00268 
00269       virtual result
00270       do_in(state_type& __state, const extern_type* __from, 
00271         const extern_type* __from_end, const extern_type*& __from_next,
00272         intern_type* __to, intern_type* __to_end, 
00273         intern_type*& __to_next) const;
00274 
00275       virtual int 
00276       do_encoding() const throw();
00277 
00278       virtual bool 
00279       do_always_noconv() const throw();
00280 
00281       virtual int 
00282       do_length(state_type&, const extern_type* __from, 
00283         const extern_type* __end, size_t __max) const;
00284 
00285       virtual int 
00286       do_max_length() const throw();
00287     };
00288 
00289   template<typename _InternT, typename _ExternT>
00290     locale::id 
00291     codecvt<_InternT, _ExternT, encoding_state>::id;
00292 
00293   // This adaptor works around the signature problems of the second
00294   // argument to iconv():  SUSv2 and others use 'const char**', but glibc 2.2
00295   // uses 'char**', which matches the POSIX 1003.1-2001 standard.
00296   // Using this adaptor, g++ will do the work for us.
00297   template<typename _Tp>
00298     inline size_t
00299     __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
00300                     iconv_t __cd, char** __inbuf, size_t* __inbytes,
00301                     char** __outbuf, size_t* __outbytes)
00302     { return __func(__cd, (_Tp)__inbuf, __inbytes, __outbuf, __outbytes); }
00303 
00304   template<typename _InternT, typename _ExternT>
00305     codecvt_base::result
00306     codecvt<_InternT, _ExternT, encoding_state>::
00307     do_out(state_type& __state, const intern_type* __from, 
00308        const intern_type* __from_end, const intern_type*& __from_next,
00309        extern_type* __to, extern_type* __to_end,
00310        extern_type*& __to_next) const
00311     {
00312       result __ret = codecvt_base::error;
00313       if (__state.good())
00314     {
00315       const descriptor_type& __desc = __state.out_descriptor();
00316       const size_t __fmultiple = sizeof(intern_type);
00317       size_t __fbytes = __fmultiple * (__from_end - __from);
00318       const size_t __tmultiple = sizeof(extern_type);
00319       size_t __tbytes = __tmultiple * (__to_end - __to); 
00320       
00321       // Argument list for iconv specifies a byte sequence. Thus,
00322       // all to/from arrays must be brutally casted to char*.
00323       char* __cto = reinterpret_cast<char*>(__to);
00324       char* __cfrom;
00325       size_t __conv;
00326 
00327       // Some encodings need a byte order marker as the first item
00328       // in the byte stream, to designate endian-ness. The default
00329       // value for the byte order marker is NULL, so if this is
00330       // the case, it's not necessary and we can just go on our
00331       // merry way.
00332       int __int_bom = __state.internal_bom();
00333       if (__int_bom)
00334         {     
00335           size_t __size = __from_end - __from;
00336           intern_type* __cfixed = static_cast<intern_type*>
00337         (__builtin_alloca(sizeof(intern_type) * (__size + 1)));
00338           __cfixed[0] = static_cast<intern_type>(__int_bom);
00339           char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
00340           __cfrom = reinterpret_cast<char*>(__cfixed);
00341           __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
00342                                         &__fbytes, &__cto, &__tbytes); 
00343         }
00344       else
00345         {
00346           intern_type* __cfixed = const_cast<intern_type*>(__from);
00347           __cfrom = reinterpret_cast<char*>(__cfixed);
00348           __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes, 
00349                        &__cto, &__tbytes); 
00350         }
00351 
00352       if (__conv != size_t(-1))
00353         {
00354           __from_next = reinterpret_cast<const intern_type*>(__cfrom);
00355           __to_next = reinterpret_cast<extern_type*>(__cto);
00356           __ret = codecvt_base::ok;
00357         }
00358       else 
00359         {
00360           if (__fbytes < __fmultiple * (__from_end - __from))
00361         {
00362           __from_next = reinterpret_cast<const intern_type*>(__cfrom);
00363           __to_next = reinterpret_cast<extern_type*>(__cto);
00364           __ret = codecvt_base::partial;
00365         }
00366           else
00367         __ret = codecvt_base::error;
00368         }
00369     }
00370       return __ret; 
00371     }
00372 
00373   template<typename _InternT, typename _ExternT>
00374     codecvt_base::result
00375     codecvt<_InternT, _ExternT, encoding_state>::
00376     do_unshift(state_type& __state, extern_type* __to, 
00377            extern_type* __to_end, extern_type*& __to_next) const
00378     {
00379       result __ret = codecvt_base::error;
00380       if (__state.good())
00381     {
00382       const descriptor_type& __desc = __state.in_descriptor();
00383       const size_t __tmultiple = sizeof(intern_type);
00384       size_t __tlen = __tmultiple * (__to_end - __to); 
00385       
00386       // Argument list for iconv specifies a byte sequence. Thus,
00387       // all to/from arrays must be brutally casted to char*.
00388       char* __cto = reinterpret_cast<char*>(__to);
00389       size_t __conv = __iconv_adaptor(iconv,__desc, NULL, NULL,
00390                                           &__cto, &__tlen); 
00391       
00392       if (__conv != size_t(-1))
00393         {
00394           __to_next = reinterpret_cast<extern_type*>(__cto);
00395           if (__tlen == __tmultiple * (__to_end - __to))
00396         __ret = codecvt_base::noconv;
00397           else if (__tlen == 0)
00398         __ret = codecvt_base::ok;
00399           else
00400         __ret = codecvt_base::partial;
00401         }
00402       else 
00403         __ret = codecvt_base::error;
00404     }
00405       return __ret; 
00406     }
00407    
00408   template<typename _InternT, typename _ExternT>
00409     codecvt_base::result
00410     codecvt<_InternT, _ExternT, encoding_state>::
00411     do_in(state_type& __state, const extern_type* __from, 
00412       const extern_type* __from_end, const extern_type*& __from_next,
00413       intern_type* __to, intern_type* __to_end, 
00414       intern_type*& __to_next) const
00415     { 
00416       result __ret = codecvt_base::error;
00417       if (__state.good())
00418     {
00419       const descriptor_type& __desc = __state.in_descriptor();
00420       const size_t __fmultiple = sizeof(extern_type);
00421       size_t __flen = __fmultiple * (__from_end - __from);
00422       const size_t __tmultiple = sizeof(intern_type);
00423       size_t __tlen = __tmultiple * (__to_end - __to); 
00424       
00425       // Argument list for iconv specifies a byte sequence. Thus,
00426       // all to/from arrays must be brutally casted to char*.
00427       char* __cto = reinterpret_cast<char*>(__to);
00428       char* __cfrom;
00429       size_t __conv;
00430 
00431       // Some encodings need a byte order marker as the first item
00432       // in the byte stream, to designate endian-ness. The default
00433       // value for the byte order marker is NULL, so if this is
00434       // the case, it's not necessary and we can just go on our
00435       // merry way.
00436       int __ext_bom = __state.external_bom();
00437       if (__ext_bom)
00438         {     
00439           size_t __size = __from_end - __from;
00440           extern_type* __cfixed =  static_cast<extern_type*>
00441         (__builtin_alloca(sizeof(extern_type) * (__size + 1)));
00442           __cfixed[0] = static_cast<extern_type>(__ext_bom);
00443           char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
00444           __cfrom = reinterpret_cast<char*>(__cfixed);
00445           __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
00446                                        &__flen, &__cto, &__tlen); 
00447         }
00448       else
00449         {
00450           extern_type* __cfixed = const_cast<extern_type*>(__from);
00451           __cfrom = reinterpret_cast<char*>(__cfixed);
00452           __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
00453                                        &__flen, &__cto, &__tlen); 
00454         }
00455 
00456       
00457       if (__conv != size_t(-1))
00458         {
00459           __from_next = reinterpret_cast<const extern_type*>(__cfrom);
00460           __to_next = reinterpret_cast<intern_type*>(__cto);
00461           __ret = codecvt_base::ok;
00462         }
00463       else 
00464         {
00465           if (__flen < static_cast<size_t>(__from_end - __from))
00466         {
00467           __from_next = reinterpret_cast<const extern_type*>(__cfrom);
00468           __to_next = reinterpret_cast<intern_type*>(__cto);
00469           __ret = codecvt_base::partial;
00470         }
00471           else
00472         __ret = codecvt_base::error;
00473         }
00474     }
00475       return __ret; 
00476     }
00477   
00478   template<typename _InternT, typename _ExternT>
00479     int 
00480     codecvt<_InternT, _ExternT, encoding_state>::
00481     do_encoding() const throw()
00482     {
00483       int __ret = 0;
00484       if (sizeof(_ExternT) <= sizeof(_InternT))
00485     __ret = sizeof(_InternT) / sizeof(_ExternT);
00486       return __ret; 
00487     }
00488   
00489   template<typename _InternT, typename _ExternT>
00490     bool 
00491     codecvt<_InternT, _ExternT, encoding_state>::
00492     do_always_noconv() const throw()
00493     { return false; }
00494   
00495   template<typename _InternT, typename _ExternT>
00496     int 
00497     codecvt<_InternT, _ExternT, encoding_state>::
00498     do_length(state_type&, const extern_type* __from, 
00499           const extern_type* __end, size_t __max) const
00500     { return std::min(__max, static_cast<size_t>(__end - __from)); }
00501 
00502   // _GLIBCXX_RESOLVE_LIB_DEFECTS
00503   // 74.  Garbled text for codecvt::do_max_length
00504   template<typename _InternT, typename _ExternT>
00505     int 
00506     codecvt<_InternT, _ExternT, encoding_state>::
00507     do_max_length() const throw()
00508     { return 1; }
00509 
00510 _GLIBCXX_END_NAMESPACE
00511 
00512 #endif

Generated on Wed Dec 31 12:48:54 2008 for libstdc++ by  doxygen 1.5.6