// Copyright 2008 The RE2 Authors. All Rights Reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #ifndef RE2_UNICODE_GROUPS_H_ #define RE2_UNICODE_GROUPS_H_ // Unicode character groups. // The codes get split into ranges of 16-bit codes // and ranges of 32-bit codes. It would be simpler // to use only 32-bit ranges, but these tables are large // enough to warrant extra care. // // Using just 32-bit ranges gives 27 kB of data. // Adding 16-bit ranges gives 18 kB of data. // Adding an extra table of 16-bit singletons would reduce // to 16.5 kB of data but make the data harder to use; // we don't bother. #include #include "util/util.h" #include "util/utf.h" namespace re2 { struct URange16 { uint16_t lo; uint16_t hi; }; struct URange32 { Rune lo; Rune hi; }; struct UGroup { const char *name; int sign; // +1 for [abc], -1 for [^abc] const URange16 *r16; int nr16; const URange32 *r32; int nr32; }; // Named by property or script name (e.g., "Nd", "N", "Han"). // Negated groups are not included. extern const UGroup unicode_groups[]; extern const int num_unicode_groups; // Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]"). // Negated groups are included. extern const UGroup posix_groups[]; extern const int num_posix_groups; // Named by Perl name (e.g., "\\d", "\\D"). // Negated groups are included. extern const UGroup perl_groups[]; extern const int num_perl_groups; } // namespace re2 #endif // RE2_UNICODE_GROUPS_H_