// Copyright 2013 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // State Table follower for scanning UTF-8 strings without converting to // 32- or 16-bit Unicode values. // // Author: dsites@google.com (Dick Sites) // #ifndef UTIL_UTF8_UTF8STATETABLE_H_ #define UTIL_UTF8_UTF8STATETABLE_H_ #include #include "integral_types.h" // for uint8, uint32, uint16 #include "stringpiece.h" namespace CLD2 { class OffsetMap; // These four-byte entries compactly encode how many bytes 0..255 to delete // in making a string replacement, how many bytes to add 0..255, and the offset // 0..64k-1 of the replacement string in remap_string. struct RemapEntry { uint8 delete_bytes; uint8 add_bytes; uint16 bytes_offset; }; // Exit type codes for state tables. All but the first get stuffed into // signed one-byte entries. The first is only generated by executable code. // To distinguish from next-state entries, these must be contiguous and // all <= kExitNone typedef enum { kExitDstSpaceFull = 239, kExitIllegalStructure, // 240 kExitOK, // 241 kExitReject, // ... kExitReplace1, kExitReplace2, kExitReplace3, kExitReplace21, kExitReplace31, kExitReplace32, kExitReplaceOffset1, kExitReplaceOffset2, kExitReplace1S0, kExitSpecial, kExitDoAgain, kExitRejectAlt, kExitNone // 255 } ExitReason; typedef enum { kExitDstSpaceFull_2 = 32767, // 0x7fff kExitIllegalStructure_2, // 32768 0x8000 kExitOK_2, // 32769 0x8001 kExitReject_2, // ... 
kExitReplace1_2, kExitReplace2_2, kExitReplace3_2, kExitReplace21_2, kExitReplace31_2, kExitReplace32_2, kExitReplaceOffset1_2, kExitReplaceOffset2_2, kExitReplace1S0_2, kExitSpecial_2, kExitDoAgain_2, kExitRejectAlt_2, kExitNone_2 // 32783 0x800f } ExitReason_2; // This struct represents one entire state table. The three initialized byte // areas are state_table, remap_base, and remap_string. state0 and state0_size // give the byte offset and length within state_table of the initial state -- // table lookups are expected to start and end in this state, but for // truncated UTF-8 strings, may end in a different state. These allow a quick // test for that condition. entry_shift is 8 for tables subscripted by a full // byte value and 6 for space-optimized tables subscripted by only six // significant bits in UTF-8 continuation bytes. typedef struct { const uint32 state0; const uint32 state0_size; const uint32 total_size; const int max_expand; const int entry_shift; const int bytes_per_entry; const uint32 losub; const uint32 hiadd; const uint8* state_table; const RemapEntry* remap_base; const uint8* remap_string; const uint8* fast_state; } UTF8StateMachineObj; // Near-duplicate declaration for tables with two-byte entries typedef struct { const uint32 state0; const uint32 state0_size; const uint32 total_size; const int max_expand; const int entry_shift; const int bytes_per_entry; const uint32 losub; const uint32 hiadd; const unsigned short* state_table; const RemapEntry* remap_base; const uint8* remap_string; const uint8* fast_state; } UTF8StateMachineObj_2; typedef UTF8StateMachineObj UTF8PropObj; typedef UTF8StateMachineObj UTF8ScanObj; typedef UTF8StateMachineObj UTF8ReplaceObj; typedef UTF8StateMachineObj_2 UTF8PropObj_2; typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2; // NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2; // Look up property of one UTF-8 character and advance over it // Return 0 if input length is zero // Return 0 and advance one byte if 
input is ill-formed uint8 UTF8GenericProperty(const UTF8PropObj* st, const uint8** src, int* srclen); // Look up property of one UTF-8 character (assumed to be valid). // (This is a faster version of UTF8GenericProperty.) bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src); // BigOneByte versions are needed for tables > 240 states, but most // won't need the TwoByte versions. // Look up property of one UTF-8 character and advance over it // Return 0 if input length is zero // Return 0 and advance one byte if input is ill-formed uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st, const uint8** src, int* srclen); // TwoByte versions are needed for tables > 240 states that don't fit onto // BigOneByte -- rare ultimate fallback // Look up property of one UTF-8 character (assumed to be valid). // (This is a faster version of UTF8GenericProperty.) bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src); // Look up property of one UTF-8 character and advance over it // Return 0 if input length is zero // Return 0 and advance one byte if input is ill-formed uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st, const uint8** src, int* srclen); // Look up property of one UTF-8 character (assumed to be valid). // (This is a faster version of UTF8GenericProperty.) bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src); // Scan a UTF-8 stringpiece based on a state table. // Always scan complete UTF-8 characters // Set number of bytes scanned. Return reason for exiting int UTF8GenericScan(const UTF8ScanObj* st, const StringPiece& str, int* bytes_consumed); // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece // and doing text replacements. // Always scan complete UTF-8 characters // Set number of bytes consumed from input, number filled to output. // Return reason for exiting // Also writes an optional OffsetMap. Pass NULL to skip writing one. 
int UTF8GenericReplace(const UTF8ReplaceObj* st,
                       const StringPiece& istr,
                       StringPiece& ostr,
                       bool is_plain_text,
                       int* bytes_consumed,   // Set to bytes consumed from input
                       int* bytes_filled,     // Set to bytes filled to output
                       int* chars_changed,
                       OffsetMap* offsetmap); // Optional; pass NULL to skip

// Older version without offsetmap
int UTF8GenericReplace(const UTF8ReplaceObj* st,
                       const StringPiece& istr,
                       StringPiece& ostr,
                       bool is_plain_text,
                       int* bytes_consumed,
                       int* bytes_filled,
                       int* chars_changed);

// Older version without is_plain_text or offsetmap
int UTF8GenericReplace(const UTF8ReplaceObj* st,
                       const StringPiece& istr,
                       StringPiece& ostr,
                       int* bytes_consumed,
                       int* bytes_filled,
                       int* chars_changed);


// TwoByte version is needed for tables > about 256 states, such
// as the table for full Unicode 4.1 canonical + compatibility mapping

// Scan a UTF-8 stringpiece based on state table with two-byte entries,
// copying to output stringpiece
// and doing text replacements.
// Always scan complete UTF-8 characters
// Set number of bytes consumed from input, number filled to output.
// Return reason for exiting
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, const StringPiece& istr, StringPiece& ostr, bool is_plain_text, int* bytes_consumed, int* bytes_filled, int* chars_changed, OffsetMap* offsetmap); // Older version without offsetmap int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, const StringPiece& istr, StringPiece& ostr, bool is_plain_text, int* bytes_consumed, int* bytes_filled, int* chars_changed); // Older version without is_plain_text or offsetmap int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, const StringPiece& istr, StringPiece& ostr, int* bytes_consumed, int* bytes_filled, int* chars_changed); static const unsigned char kUTF8LenTbl[256] = { 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4 }; inline int UTF8OneCharLen(const char* in) { return kUTF8LenTbl[*reinterpret_cast(in)]; } // Adjust a stringpiece to encompass complete UTF-8 characters. // The data pointer will be increased by 0..3 bytes to get to a character // boundary, and the length will then be decreased by 0..3 bytes // to encompass the last complete character. // This is useful especially when a UTF-8 string must be put into a fixed- // maximum-size buffer cleanly, such as a MySQL buffer. void UTF8TrimToChars(StringPiece* istr); } // End namespace CLD2 #endif // UTIL_UTF8_UTF8STATETABLE_H_