76ce33da74
upgrade docs to quickbook 1.7.
485 lines
19 KiB
Plaintext
485 lines
19 KiB
Plaintext
[/
|
|
Copyright 2006-2007 John Maddock.
|
|
Distributed under the Boost Software License, Version 1.0.
|
|
(See accompanying file LICENSE_1_0.txt or copy at
|
|
http://www.boost.org/LICENSE_1_0.txt).
|
|
]
|
|
|
|
|
|
[section:icu Working With Unicode and ICU String Types]
|
|
|
|
[section:intro Introduction to using Regex with ICU]
|
|
|
|
The header:
|
|
|
|
<boost/regex/icu.hpp>
|
|
|
|
contains the data types and algorithms necessary for working with regular
|
|
expressions in a Unicode aware environment.
|
|
|
|
In order to use this header you will need the
|
|
[@http://www.ibm.com/software/globalization/icu/ ICU library], and you will need
|
|
to have built the Boost.Regex library with
|
|
[link boost_regex.install.building_with_unicode_and_icu_su ICU support enabled].
|
|
|
|
The header will enable you to:
|
|
|
|
* Create regular expressions that treat Unicode strings as sequences of UTF-32 code points.
|
|
* Create regular expressions that support various Unicode data properties, including character classification.
|
|
* Transparently search Unicode strings that are encoded as either UTF-8, UTF-16 or UTF-32.
|
|
|
|
[endsect]
|
|
|
|
[section:unicode_types Unicode regular expression types]
|
|
|
|
Header `<boost/regex/icu.hpp>` provides a regular expression traits class that
|
|
handles UTF-32 characters:
|
|
|
|
class icu_regex_traits;
|
|
|
|
and a regular expression type based upon that:
|
|
|
|
typedef basic_regex<UChar32,icu_regex_traits> u32regex;
|
|
|
|
The type `u32regex` is the regular expression type to use for all Unicode
|
|
regular expressions; internally it uses UTF-32 code points, but can be
|
|
created from, and used to search, either UTF-8, or UTF-16 encoded strings
|
|
as well as UTF-32 ones.
|
|
|
|
The constructors, and assign member functions of `u32regex`, require UTF-32
|
|
encoded strings, but there are a series of overloaded algorithms called
|
|
`make_u32regex` which allow regular expressions to be created from
|
|
UTF-8, UTF-16, or UTF-32 encoded strings:
|
|
|
|
template <class InputIterator>
|
|
u32regex make_u32regex(InputIterator i,
|
|
InputIterator j,
|
|
boost::regex_constants::syntax_option_type opt);
|
|
|
|
[*Effects]: Creates a regular expression object from the iterator sequence \[i,j).
|
|
The character encoding of the sequence is determined based upon sizeof(*i):
|
|
1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32.
|
|
|
|
u32regex make_u32regex(const char* p,
|
|
boost::regex_constants::syntax_option_type opt
|
|
= boost::regex_constants::perl);
|
|
|
|
[*Effects]: Creates a regular expression object from the Null-terminated
|
|
UTF-8 character sequence /p/.
|
|
|
|
u32regex make_u32regex(const unsigned char* p,
|
|
boost::regex_constants::syntax_option_type opt
|
|
= boost::regex_constants::perl);
|
|
|
|
[*Effects]: Creates a regular expression object from the Null-terminated UTF-8 character sequence p.
|
|
|
|
u32regex make_u32regex(const wchar_t* p,
|
|
boost::regex_constants::syntax_option_type opt
|
|
= boost::regex_constants::perl);
|
|
|
|
[*Effects]: Creates a regular expression object from the Null-terminated character sequence p. The character encoding of the sequence is determined based upon sizeof(wchar_t): 1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32.
|
|
|
|
u32regex make_u32regex(const UChar* p,
|
|
boost::regex_constants::syntax_option_type opt
|
|
= boost::regex_constants::perl);
|
|
|
|
[*Effects]: Creates a regular expression object from the Null-terminated UTF-16 character sequence p.
|
|
|
|
template<class C, class T, class A>
|
|
u32regex make_u32regex(const std::basic_string<C, T, A>& s,
|
|
boost::regex_constants::syntax_option_type opt
|
|
= boost::regex_constants::perl);
|
|
|
|
[*Effects]: Creates a regular expression object from the string s. The character encoding of the string is determined based upon sizeof(C): 1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32.
|
|
|
|
u32regex make_u32regex(const UnicodeString& s,
|
|
boost::regex_constants::syntax_option_type opt
|
|
= boost::regex_constants::perl);
|
|
|
|
[*Effects]: Creates a regular expression object from the UTF-16 encoded string s.
|
|
|
|
[endsect]
|
|
|
|
[section:unicode_algo Unicode Regular Expression Algorithms]
|
|
|
|
The regular expression algorithms [regex_match], [regex_search] and [regex_replace]
|
|
all expect that the character sequence upon which they operate,
|
|
is encoded in the same character encoding as the regular expression object
|
|
with which they are used. For Unicode regular expressions that behavior is
|
|
undesirable: while we may want to process the data in UTF-32 "chunks", the
|
|
actual data is much more likely to be encoded as either UTF-8 or UTF-16.
|
|
Therefore the header `<boost/regex/icu.hpp>` provides a series of thin wrappers
|
|
around these algorithms, called `u32regex_match`, `u32regex_search`, and
|
|
`u32regex_replace`. These wrappers use iterator-adapters internally to
|
|
make external UTF-8 or UTF-16 data look as though it's really a UTF-32 sequence,
|
|
that can then be passed on to the "real" algorithm.
|
|
|
|
[h4 u32regex_match]
|
|
|
|
For each [regex_match] algorithm defined by `<boost/regex.hpp>`,
|
|
`<boost/regex/icu.hpp>` defines an overloaded algorithm that takes the
|
|
same arguments, but which is called `u32regex_match`, and which will
|
|
accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an
|
|
ICU UnicodeString as input.
|
|
|
|
Example: match a password, encoded in a UTF-16 UnicodeString:
|
|
|
|
//
|
|
// Find out if *password* meets our password requirements,
|
|
// as defined by the regular expression *requirements*.
|
|
//
|
|
bool is_valid_password(const UnicodeString& password, const UnicodeString& requirements)
|
|
{
|
|
return boost::u32regex_match(password, boost::make_u32regex(requirements));
|
|
}
|
|
|
|
Example: match a UTF-8 encoded filename:
|
|
|
|
//
|
|
// Extract filename part of a path from a UTF-8 encoded std::string and return the result
|
|
// as another std::string:
|
|
//
|
|
std::string get_filename(const std::string& path)
|
|
{
|
|
boost::u32regex r = boost::make_u32regex("(?:\\A|.*\\\\)([^\\\\]+)");
|
|
boost::smatch what;
|
|
if(boost::u32regex_match(path, what, r))
|
|
{
|
|
// extract $1 as a std::string:
|
|
return what.str(1);
|
|
}
|
|
else
|
|
{
|
|
throw std::runtime_error("Invalid pathname");
|
|
}
|
|
}
|
|
|
|
[h4 u32regex_search]
|
|
|
|
For each [regex_search] algorithm defined by `<boost/regex.hpp>`,
|
|
`<boost/regex/icu.hpp>` defines an overloaded algorithm that takes the
|
|
same arguments, but which is called `u32regex_search`, and which will
|
|
accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an ICU
|
|
UnicodeString as input.
|
|
|
|
Example: search for a character sequence in a specific language block:
|
|
|
|
UnicodeString extract_greek(const UnicodeString& text)
|
|
{
|
|
// searches through some UTF-16 encoded text for a block encoded in Greek,
|
|
// this expression is imperfect, but the best we can do for now - searching
|
|
// for specific scripts is actually pretty hard to do right.
|
|
//
|
|
// Here we search for a character sequence that begins with a Greek letter,
|
|
// and continues with characters that are either not-letters ( [^[:L*:]] )
|
|
// or are characters in the Greek character block ( [\\x{370}-\\x{3FF}] ).
|
|
//
|
|
boost::u32regex r = boost::make_u32regex(
|
|
L"[\\x{370}-\\x{3FF}](?:[^[:L*:]]|[\\x{370}-\\x{3FF}])*");
|
|
boost::u16match what;
|
|
if(boost::u32regex_search(text, what, r))
|
|
{
|
|
// extract $0 as a UnicodeString:
|
|
return UnicodeString(what[0].first, what.length(0));
|
|
}
|
|
else
|
|
{
|
|
throw std::runtime_error("No Greek found!");
|
|
}
|
|
}
|
|
|
|
[h4 u32regex_replace]
|
|
|
|
For each [regex_replace] algorithm defined by `<boost/regex.hpp>`,
|
|
`<boost/regex/icu.hpp>` defines an overloaded algorithm that takes
|
|
the same arguments, but which is called `u32regex_replace`, and which will
|
|
accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an ICU
|
|
UnicodeString as input. The input sequence and the format string specifier
|
|
passed to the algorithm, can be encoded independently (for example one can
|
|
be UTF-8, the other in UTF-16), but the result string / output iterator
|
|
argument must use the same character encoding as the text being searched.
|
|
|
|
Example: Credit card number reformatting:
|
|
|
|
//
|
|
// Take a credit card number as a string of digits,
|
|
// and reformat it as a human readable string with "-"
|
|
// separating each group of four digits;
|
|
// note that we're mixing a UTF-32 regex, with a UTF-16
|
|
// string and a UTF-8 format specifier, and it still all
|
|
// just works:
|
|
//
|
|
const boost::u32regex e = boost::make_u32regex(
|
|
"\\A(\\d{3,4})[- ]?(\\d{4})[- ]?(\\d{4})[- ]?(\\d{4})\\z");
|
|
const char* human_format = "$1-$2-$3-$4";
|
|
|
|
UnicodeString human_readable_card_number(const UnicodeString& s)
|
|
{
|
|
return boost::u32regex_replace(s, e, human_format);
|
|
}
|
|
|
|
[endsect]
|
|
[section:unicode_iter Unicode Aware Regex Iterators]
|
|
|
|
[h4 u32regex_iterator]
|
|
|
|
Type `u32regex_iterator` is in all respects the same as [regex_iterator]
|
|
except that since the regular expression type is always `u32regex`
|
|
it only takes one template parameter (the iterator type). It also calls
|
|
`u32regex_search` internally, allowing it to interface correctly with
|
|
UTF-8, UTF-16, and UTF-32 data:
|
|
|
|
template <class BidirectionalIterator>
|
|
class u32regex_iterator
|
|
{
|
|
// for members see regex_iterator
|
|
};
|
|
|
|
typedef u32regex_iterator<const char*> utf8regex_iterator;
|
|
typedef u32regex_iterator<const UChar*> utf16regex_iterator;
|
|
typedef u32regex_iterator<const UChar32*> utf32regex_iterator;
|
|
|
|
In order to simplify the construction of a `u32regex_iterator` from a string,
|
|
there are a series of non-member helper functions called `make_u32regex_iterator`:
|
|
|
|
u32regex_iterator<const char*>
|
|
make_u32regex_iterator(const char* s,
|
|
const u32regex& e,
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
u32regex_iterator<const wchar_t*>
|
|
make_u32regex_iterator(const wchar_t* s,
|
|
const u32regex& e,
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
u32regex_iterator<const UChar*>
|
|
make_u32regex_iterator(const UChar* s,
|
|
const u32regex& e,
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
template <class charT, class Traits, class Alloc>
|
|
u32regex_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>
|
|
make_u32regex_iterator(const std::basic_string<charT, Traits, Alloc>& s,
|
|
const u32regex& e,
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
u32regex_iterator<const UChar*>
|
|
make_u32regex_iterator(const UnicodeString& s,
|
|
const u32regex& e,
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
Each of these overloads returns an iterator that enumerates all occurrences
|
|
of expression /e/, in text /s/, using match_flags /m/.
|
|
|
|
Example: search for international currency symbols, along with their associated numeric value:
|
|
|
|
void enumerate_currencies(const std::string& text)
|
|
{
|
|
// enumerate and print all the currency symbols, along
|
|
// with any associated numeric values:
|
|
const char* re =
|
|
"([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?"
|
|
"([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?"
|
|
"(?(1)"
|
|
"|(?(2)"
|
|
"[[:Cf:][:Cc:][:Z*:]]*"
|
|
")"
|
|
"[[:Sc:]]"
|
|
")";
|
|
boost::u32regex r = boost::make_u32regex(re);
|
|
boost::u32regex_iterator<std::string::const_iterator>
|
|
i(boost::make_u32regex_iterator(text, r)), j;
|
|
while(i != j)
|
|
{
|
|
std::cout << (*i)[0] << std::endl;
|
|
++i;
|
|
}
|
|
}
|
|
|
|
Calling
|
|
|
|
[/this doesn't format correctly as code:]
|
|
[pre enumerate_currencies(" $100.23 or '''£'''198.12 ");]
|
|
|
|
Yields the output:
|
|
|
|
[pre
|
|
$100.23
|
|
'''£'''198.12
|
|
]
|
|
|
|
Provided of course that the input is encoded as UTF-8.
|
|
|
|
[h4 u32regex_token_iterator]
|
|
|
|
Type `u32regex_token_iterator` is in all respects the same as [regex_token_iterator]
|
|
except that since the regular expression type is always `u32regex` it only
|
|
takes one template parameter (the iterator type). It also calls
|
|
`u32regex_search` internally, allowing it to interface correctly with UTF-8,
|
|
UTF-16, and UTF-32 data:
|
|
|
|
template <class BidirectionalIterator>
|
|
class u32regex_token_iterator
|
|
{
|
|
// for members see regex_token_iterator
|
|
};
|
|
|
|
typedef u32regex_token_iterator<const char*> utf8regex_token_iterator;
|
|
typedef u32regex_token_iterator<const UChar*> utf16regex_token_iterator;
|
|
typedef u32regex_token_iterator<const UChar32*> utf32regex_token_iterator;
|
|
|
|
In order to simplify the construction of a `u32regex_token_iterator` from a string,
|
|
there are a series of non-member helper functions called `make_u32regex_token_iterator`:
|
|
|
|
u32regex_token_iterator<const char*>
|
|
make_u32regex_token_iterator(
|
|
const char* s,
|
|
const u32regex& e,
|
|
int sub,
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
u32regex_token_iterator<const wchar_t*>
|
|
make_u32regex_token_iterator(
|
|
const wchar_t* s,
|
|
const u32regex& e,
|
|
int sub,
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
u32regex_token_iterator<const UChar*>
|
|
make_u32regex_token_iterator(
|
|
const UChar* s,
|
|
const u32regex& e,
|
|
int sub,
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
template <class charT, class Traits, class Alloc>
|
|
u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>
|
|
make_u32regex_token_iterator(
|
|
const std::basic_string<charT, Traits, Alloc>& s,
|
|
const u32regex& e,
|
|
int sub,
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
u32regex_token_iterator<const UChar*>
|
|
make_u32regex_token_iterator(
|
|
const UnicodeString& s,
|
|
const u32regex& e,
|
|
int sub,
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
Each of these overloads returns an iterator that enumerates all occurrences of
|
|
marked sub-expression /sub/ in regular expression /e/, found in text /s/, using
|
|
match_flags /m/.
|
|
|
|
template <std::size_t N>
|
|
u32regex_token_iterator<const char*>
|
|
make_u32regex_token_iterator(
|
|
const char* p,
|
|
const u32regex& e,
|
|
const int (&submatch)[N],
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
template <std::size_t N>
|
|
u32regex_token_iterator<const wchar_t*>
|
|
make_u32regex_token_iterator(
|
|
const wchar_t* p,
|
|
const u32regex& e,
|
|
const int (&submatch)[N],
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
template <std::size_t N>
|
|
u32regex_token_iterator<const UChar*>
|
|
make_u32regex_token_iterator(
|
|
const UChar* p,
|
|
const u32regex& e,
|
|
const int (&submatch)[N],
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
template <class charT, class Traits, class Alloc, std::size_t N>
|
|
u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>
|
|
make_u32regex_token_iterator(
|
|
const std::basic_string<charT, Traits, Alloc>& p,
|
|
const u32regex& e,
|
|
const int (&submatch)[N],
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
template <std::size_t N>
|
|
u32regex_token_iterator<const UChar*>
|
|
make_u32regex_token_iterator(
|
|
const UnicodeString& s,
|
|
const u32regex& e,
|
|
const int (&submatch)[N],
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
Each of these overloads returns an iterator that enumerates one sub-expression
|
|
for each submatch in regular expression /e/, found in text /s/, using match_flags /m/.
|
|
|
|
u32regex_token_iterator<const char*>
|
|
make_u32regex_token_iterator(
|
|
const char* p,
|
|
const u32regex& e,
|
|
const std::vector<int>& submatch,
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
u32regex_token_iterator<const wchar_t*>
|
|
make_u32regex_token_iterator(
|
|
const wchar_t* p,
|
|
const u32regex& e,
|
|
const std::vector<int>& submatch,
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
u32regex_token_iterator<const UChar*>
|
|
make_u32regex_token_iterator(
|
|
const UChar* p,
|
|
const u32regex& e,
|
|
const std::vector<int>& submatch,
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
template <class charT, class Traits, class Alloc>
|
|
u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>
|
|
make_u32regex_token_iterator(
|
|
const std::basic_string<charT, Traits, Alloc>& p,
|
|
const u32regex& e,
|
|
const std::vector<int>& submatch,
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
u32regex_token_iterator<const UChar*>
|
|
make_u32regex_token_iterator(
|
|
const UnicodeString& s,
|
|
const u32regex& e,
|
|
const std::vector<int>& submatch,
|
|
regex_constants::match_flag_type m = regex_constants::match_default);
|
|
|
|
Each of these overloads returns an iterator that enumerates one sub-expression for
|
|
each submatch in regular expression /e/, found in text /s/, using match_flags /m/.
|
|
|
|
Example: search for international currency symbols, along with their associated numeric value:
|
|
|
|
void enumerate_currencies2(const std::string& text)
|
|
{
|
|
// enumerate and print all the currency symbols, along
|
|
// with any associated numeric values:
|
|
const char* re =
|
|
"([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?"
|
|
"([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?"
|
|
"(?(1)"
|
|
"|(?(2)"
|
|
"[[:Cf:][:Cc:][:Z*:]]*"
|
|
")"
|
|
"[[:Sc:]]"
|
|
")";
|
|
boost::u32regex r = boost::make_u32regex(re);
|
|
boost::u32regex_token_iterator<std::string::const_iterator>
|
|
i(boost::make_u32regex_token_iterator(text, r, 1)), j;
|
|
while(i != j)
|
|
{
|
|
std::cout << *i << std::endl;
|
|
++i;
|
|
}
|
|
}
|
|
|
|
[endsect]
|
|
|
|
[endsect]
|
|
|