CXXR (C++ R)
String.h
Go to the documentation of this file.
1 /*CXXR $Id: String.h 1390 2013-06-11 14:41:41Z arr $
2  *CXXR
3  *CXXR This file is part of CXXR, a project to refactor the R interpreter
4  *CXXR into C++. It may consist in whole or in part of program code and
5  *CXXR documentation taken from the R project itself, incorporated into
6  *CXXR CXXR (and possibly MODIFIED) under the terms of the GNU General Public
7  *CXXR Licence.
8  *CXXR
9  *CXXR CXXR is Copyright (C) 2008-13 Andrew R. Runnalls, subject to such other
10  *CXXR copyrights and copyright restrictions as may be stated below.
11  *CXXR
12  *CXXR CXXR is not part of the R project, and bugs and other issues should
13  *CXXR not be reported via r-bugs or other R project channels; instead refer
14  *CXXR to the CXXR website.
15  *CXXR */
16 
17 /*
18  * R : A Computer Language for Statistical Data Analysis
19  * Copyright (C) 1995, 1996 Robert Gentleman and Ross Ihaka
20  * Copyright (C) 1999-2006 The R Development Core Team.
21  *
22  * This program is free software; you can redistribute it and/or modify
23  * it under the terms of the GNU General Public License as published by
24  * the Free Software Foundation; either version 2.1 of the License, or
25  * (at your option) any later version.
26  *
27  * This program is distributed in the hope that it will be useful,
28  * but WITHOUT ANY WARRANTY; without even the implied warranty of
29  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30  * GNU Lesser General Public License for more details.
31  *
32  * You should have received a copy of the GNU General Public License
33  * along with this program; if not, a copy is available at
34  * http://www.r-project.org/Licenses/
35  */
36 
41 #ifndef CXXR_STRING_H
42 #define CXXR_STRING_H
43 
44 #include "CXXR/GCRoot.h"
45 #include "CXXR/VectorBase.h"
46 
/* Encoding tags that can be attached to a string.
 *
 * The enumeration is declared outside the __cplusplus guard, so it is
 * visible to both C and C++ translation units.  The numeric values are
 * not contiguous (4 is unused, and CE_ANY is 99).
 *
 * NOTE(review): the meaning of each tag is inferred from its name
 * only; confirm against the R API documentation.
 */
typedef enum {
    CE_NATIVE = 0,
    CE_UTF8   = 1,
    CE_LATIN1 = 2,
    CE_BYTES  = 3,
    CE_SYMBOL = 5,
    CE_ANY    = 99
} cetype_t;
55 
56 #ifdef __cplusplus
57 
58 #include <boost/serialization/nvp.hpp>
59 #include <tr1/unordered_map>
60 #include <string>
61 
62 #include "CXXR/Allocator.hpp"
63 #include "CXXR/SEXP_downcast.hpp"
64 #include "CXXR/SchwarzCounter.hpp"
65 
66 namespace CXXR {
78  class String : public VectorBase {
79  public:
85  class Comparator {
86  public:
93  explicit Comparator(bool na_last = true)
94  : m_na_last(na_last)
95  {}
96 
105  bool operator()(const String* l, const String* r) const;
106  private:
107  bool m_na_last;
108  };
109 
119  char operator[](unsigned int index) const
120  {
121  return m_string->c_str()[index];
122  }
123 
128  static String* blank()
129  {
130  return s_blank;
131  }
132 
138  const char* c_str() const
139  {
140  return m_string->c_str();
141  }
142 
148  cetype_t encoding() const
149  {
150  return m_encoding;
151  }
152 
165  static cetype_t GPBits2Encoding(unsigned int gpbits);
166 
171  bool isASCII() const
172  {
173  return m_ascii;
174  }
175 
180  bool isNA() const
181  {
182  return this == s_na;
183  }
184 
197  static String* NA()
198  {
199  return s_na;
200  }
201 
222  static String* obtain(const std::string& str,
223  cetype_t encoding = CE_NATIVE);
224 
229  static const char* staticTypeName()
230  {
231  return "char";
232  }
233 
238  const std::string& stdstring() const
239  {
240  return *m_string;
241  }
242 
243  // Virtual functions of RObject:
244  unsigned int packGPBits() const;
245  const char* typeName() const;
246  private:
247  friend class boost::serialization::access;
248  friend class SchwarzCounter<String>;
249  friend class Symbol;
250 
251  // The first element of the key is the text, the second
252  // element the encoding:
253  typedef std::pair<std::string, cetype_t> key;
254 
255  // Hashing is based simply on the text of the key, not on its
256  // encoding:
257  class Hasher : public std::unary_function<key, std::size_t> {
258  public:
259  std::size_t operator()(const key& k) const
260  {
261  return s_string_hasher(k.first);
262  }
263  private:
264  static std::tr1::hash<std::string> s_string_hasher;
265  };
266 
267  // The cache is implemented as a mapping from keys to pointers
268  // to String objects. Each String simply contains a pointer
269  // locating its entry within the cache.
270  typedef
271  std::tr1::unordered_map<key, String*, Hasher, std::equal_to<key>,
272  CXXR::Allocator<std::pair<const key,
273  String*> >
274  > map;
275 
276  static map* s_cache;
277  static std::string* s_na_string;
278  static String* s_na;
279  static String* s_blank;
280 
281  map::value_type* m_key_val_pr;
282  const std::string* m_string;
283  cetype_t m_encoding;
284  mutable Symbol* m_symbol; // Pointer to the Symbol object identified
285  // by this String, or a null pointer if none.
286  bool m_ascii;
287 
288  // A null value of key_val_pr is used to designate the NA string:
289  explicit String(map::value_type* key_val_pr = 0);
290 
291  // Not implemented. Declared to prevent
292  // compiler-generated versions:
293  String(const String&);
294  String& operator=(const String&);
295 
296  // Declared private to ensure that String objects are
297  // allocated only using 'new'.
298  ~String();
299 
300  static void cleanup();
301 
302  // Initialize the static data members:
303  static void initialize();
304 
305  template<class Archive>
306  void load(Archive & ar, const unsigned int version);
307 
308  template<class Archive>
309  void save(Archive & ar, const unsigned int version) const;
310 
311  // Fields not serialised here are set up by the constructor:
312  template <class Archive>
313  void serialize(Archive& ar, const unsigned int version) {
314  boost::serialization::split_member(ar, *this, version);
315  }
316  };
317 
/** @brief Free-function ASCII test on a std::string.
 *
 * Declared here, defined elsewhere.
 *
 * NOTE(review): presumably returns true iff every character of
 * \a str is an ASCII character - confirm against the definition.
 *
 * @param str The text to be examined.
 */
bool isASCII(const std::string& str);
327 } // namespace CXXR
328 
329 BOOST_CLASS_EXPORT_KEY(CXXR::String)
330 
331 namespace {
332  CXXR::SchwarzCounter<CXXR::String> string_schwarz_ctr;
333 }
334 
335 // ***** Implementation of non-inlined templated members *****
336 
337 template<class Archive>
338 void CXXR::String::load(Archive& ar, const unsigned int version)
339 {
340  // This will only ever be applied to a 'temporary' String
341  // created by the default constructor.
342  ar & BOOST_SERIALIZATION_BASE_OBJECT_NVP(RObject);
343  bool isna;
344  ar >> BOOST_SERIALIZATION_NVP(isna);
345  if (isna)
347  else {
348  std::string str;
349  ar >> boost::serialization::make_nvp("string", str);
350  ar >> BOOST_SERIALIZATION_NVP(m_encoding);
351  S11nScope::defineRelocation(this, obtain(str, m_encoding));
352  }
353 }
354 
355 template<class Archive>
356 void CXXR::String::save(Archive& ar, const unsigned int version) const
357 {
358  ar & BOOST_SERIALIZATION_BASE_OBJECT_NVP(RObject);
359  bool isna = (this == NA());
360  ar << BOOST_SERIALIZATION_NVP(isna);
361  if (!isna) {
362  std::string str = stdstring();
363  ar << boost::serialization::make_nvp("string", str);
364  ar << BOOST_SERIALIZATION_NVP(m_encoding);
365  }
366 }
367 
368 extern "C" {
369 
370 #endif /* __cplusplus */
371 
372  extern SEXP R_NaString;
373  extern SEXP R_BlankString;
374 
382 #ifndef __cplusplus
383  int ENC_KNOWN(SEXP x);
384 #else
385  inline int ENC_KNOWN(SEXP x)
386  {
387  // Use explicit namespace qualification to prevent ambiguities:
388  const CXXR::String& str = *CXXR::SEXP_downcast<const CXXR::String*>(x);
389  cetype_t enc = str.encoding();
390  return enc == CE_LATIN1 || enc == CE_UTF8;
391  }
392 #endif
393 
400 #ifndef __cplusplus
401  int IS_ASCII(SEXP x);
402 #else
403  inline int IS_ASCII(SEXP x)
404  {
405  // Use explicit namespace qualification to prevent ambiguities:
406  const CXXR::String& str = *CXXR::SEXP_downcast<const CXXR::String*>(x);
407  return Rboolean(str.isASCII());
408  }
409 #endif
410 
417 #ifndef __cplusplus
418  int IS_BYTES(SEXP x);
419 #else
420  inline int IS_BYTES(SEXP x)
421  {
422  // Use explicit namespace qualification to prevent ambiguities:
423  const CXXR::String& str = *CXXR::SEXP_downcast<const CXXR::String*>(x);
424  return Rboolean(str.encoding() == CE_BYTES);
425  }
426 #endif
427 
434 #ifndef __cplusplus
435  Rboolean IS_LATIN1(SEXP x);
436 #else
437  inline Rboolean IS_LATIN1(SEXP x)
438  {
439  // Use explicit namespace qualification to prevent ambiguities:
440  const CXXR::String& str = *CXXR::SEXP_downcast<const CXXR::String*>(x);
441  return Rboolean(str.encoding() == CE_LATIN1);
442  }
443 #endif
444 
451 #ifndef __cplusplus
452  Rboolean IS_UTF8(SEXP x);
453 #else
454  inline Rboolean IS_UTF8(SEXP x)
455  {
456  // Use explicit namespace qualification to prevent ambiguities:
457  const CXXR::String& str = *CXXR::SEXP_downcast<const CXXR::String*>(x);
458  return Rboolean(str.encoding() == CE_UTF8);
459  }
460 #endif
461 
468 #ifndef __cplusplus
469  const char *R_CHAR(SEXP x);
470 #else
471  inline const char *R_CHAR(SEXP x)
472  {
473  using namespace CXXR;
474  return SEXP_downcast<String*>(x, false)->stdstring().c_str();
475  }
476 #endif
477 
490 #ifndef __cplusplus
491  SEXP Rf_mkChar(const char* str);
492 #else
493  inline SEXP Rf_mkChar(const char* str)
494  {
495  return CXXR::String::obtain(str);
496  }
497 #endif
498 
514 #ifndef __cplusplus
515  SEXP Rf_mkCharCE(const char* str, cetype_t encoding);
516 #else
517  inline SEXP Rf_mkCharCE(const char* str, cetype_t encoding)
518  {
519  return CXXR::String::obtain(str, encoding);
520  }
521 #endif
522 
544  SEXP Rf_mkCharLenCE(const char* text, int length, cetype_t encoding);
545 
563 #ifndef __cplusplus
564  SEXP Rf_mkCharLen(const char* text, int length);
565 #else
566  inline SEXP Rf_mkCharLen(const char* text, int length)
567  {
568  return Rf_mkCharLenCE(text, length, CE_NATIVE);
569  }
570 #endif
571 
582  const char* Rf_translateCharUTF8(SEXP x);
583 
584 #ifdef __cplusplus
585 } // extern "C"
586 #endif
587 
588 #endif /* CXXR_STRING_H */