/* * Copyright 2001-2002,2004 The Apache Software Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * $Id: UnicodeRangeFactory.cpp 191054 2005-06-17 02:56:35Z jberry $ */ // --------------------------------------------------------------------------- // Includes // --------------------------------------------------------------------------- #include #include #include #include #include #include XERCES_CPP_NAMESPACE_BEGIN // --------------------------------------------------------------------------- // Local data // --------------------------------------------------------------------------- const int UNICATEGSIZE = 37; const unsigned short CHAR_LETTER = 30; const unsigned short CHAR_MARK = 31; const unsigned short CHAR_NUMBER = 32; const unsigned short CHAR_SEPARATOR = 33; const unsigned short CHAR_OTHER = 34; const unsigned short CHAR_PUNCTUATION = 35; const unsigned short CHAR_SYMBOL = 36; const XMLCh uniCategNames[][UNICATEGSIZE] = { {chLatin_C, chLatin_n, chNull}, {chLatin_L, chLatin_u, chNull}, {chLatin_L, chLatin_l, chNull}, {chLatin_L, chLatin_t, chNull}, {chLatin_L, chLatin_m, chNull}, {chLatin_L, chLatin_o, chNull}, {chLatin_M, chLatin_n, chNull}, {chLatin_M, chLatin_e, chNull}, {chLatin_M, chLatin_c, chNull}, {chLatin_N, chLatin_d, chNull}, {chLatin_N, chLatin_l, chNull}, {chLatin_N, chLatin_o, chNull}, {chLatin_Z, chLatin_s, chNull}, {chLatin_Z, chLatin_l, chNull}, {chLatin_Z, chLatin_p, chNull}, {chLatin_C, chLatin_c, chNull}, {chLatin_C, chLatin_f, chNull}, {chLatin_C, chLatin_o, chNull}, {chLatin_C, chLatin_s, chNull}, {chLatin_P, chLatin_d, chNull}, {chLatin_P, chLatin_s, chNull}, {chLatin_P, chLatin_e, chNull}, {chLatin_P, chLatin_c, chNull}, {chLatin_P, chLatin_o, chNull}, {chLatin_S, chLatin_m, chNull}, {chLatin_S, chLatin_c, chNull}, {chLatin_S, chLatin_k, chNull}, {chLatin_S, chLatin_o, chNull}, {chLatin_P, chLatin_i, chNull}, {chLatin_P, chLatin_f, chNull}, {chLatin_L, chNull}, {chLatin_M, chNull}, {chLatin_N, chNull}, {chLatin_Z, chNull}, {chLatin_C, chNull}, {chLatin_P, chNull}, {chLatin_S, chNull}, }; // --------------------------------------------------------------------------- // UnicodeRangeFactory: Constructors and Destructor // --------------------------------------------------------------------------- UnicodeRangeFactory::UnicodeRangeFactory() { } UnicodeRangeFactory::~UnicodeRangeFactory() { } // --------------------------------------------------------------------------- // UnicodeRangeFactory: Range creation methods // --------------------------------------------------------------------------- void UnicodeRangeFactory::buildRanges(RangeTokenMap *rangeTokMap) { if (fRangesCreated) return; if (!fKeywordsInitialized) { initializeKeywordMap(rangeTokMap); } TokenFactory* tokFactory = rangeTokMap->getTokenFactory(); RangeToken* ranges[UNICATEGSIZE]; RangeToken* tok; for (int i=0; i < UNICATEGSIZE; i++) { ranges[i] = tokFactory->createRange(); } for (int j=0; j < 0x10000; j++) { unsigned short charType = XMLUniCharacter::getType(j); ranges[charType]->addRange(j, j); charType = getUniCategory(charType); ranges[charType]->addRange(j, j); } ranges[XMLUniCharacter::UNASSIGNED]->addRange(0x10000, Token::UTF16_MAX); for (int k=0; k < UNICATEGSIZE; k++) { tok = (RangeToken*) RangeToken::complementRanges(ranges[k], tokFactory); // build the internal map. tok->createMap(); rangeTokMap->setRangeToken(uniCategNames[k], ranges[k]); rangeTokMap->setRangeToken(uniCategNames[k], tok , true); } // Create all range tok = tokFactory->createRange(); tok->addRange(0, Token::UTF16_MAX); // build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgUniAll, tok); // Create alpha range tok = tokFactory->createRange(); tok->mergeRanges(ranges[XMLUniCharacter::UPPERCASE_LETTER]); tok->mergeRanges(ranges[XMLUniCharacter::LOWERCASE_LETTER]); tok->mergeRanges(ranges[XMLUniCharacter::OTHER_LETTER]); // build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgUniIsAlpha, tok); // Create alpha-num range RangeToken* alnumTok = tokFactory->createRange(); alnumTok->mergeRanges(tok); alnumTok->mergeRanges(ranges[XMLUniCharacter::DECIMAL_DIGIT_NUMBER]); // build the internal map. alnumTok->createMap(); rangeTokMap->setRangeToken(fgUniIsAlnum, alnumTok); // Create word range tok = tokFactory->createRange(); tok->mergeRanges(alnumTok); tok->addRange(chUnderscore, chUnderscore); // build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgUniIsWord, tok); tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory); // build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgUniIsWord, tok , true); // Create assigned range tok = (RangeToken*)RangeToken::complementRanges( ranges[XMLUniCharacter::UNASSIGNED], tokFactory, tokFactory->getMemoryManager()); // build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgUniAssigned,tok); // Create space range tok = tokFactory->createRange(); tok->mergeRanges(ranges[XMLUniCharacter::SPACE_SEPARATOR]); tok->mergeRanges(ranges[XMLUniCharacter::LINE_SEPARATOR]); //tok->mergeRanges(ranges[XMLUniCharacter::PARAGRAPH_SEPARATOR]); // build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgUniIsSpace, tok); tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory); // build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgUniIsSpace, tok , true); // build the internal maps. for (int l=0; l < UNICATEGSIZE; l++) { ranges[l]->createMap(); } fRangesCreated = true; } // --------------------------------------------------------------------------- // UnicodeRangeFactory: Initialization methods // --------------------------------------------------------------------------- void UnicodeRangeFactory::initializeKeywordMap(RangeTokenMap *rangeTokMap) { if (fKeywordsInitialized) return; for (int k=0; k < UNICATEGSIZE; k++) { rangeTokMap->addKeywordMap(uniCategNames[k], fgUnicodeCategory); } rangeTokMap->addKeywordMap(fgUniAll, fgUnicodeCategory); rangeTokMap->addKeywordMap(fgUniIsAlpha, fgUnicodeCategory); rangeTokMap->addKeywordMap(fgUniIsAlnum, fgUnicodeCategory); rangeTokMap->addKeywordMap(fgUniIsWord, fgUnicodeCategory); rangeTokMap->addKeywordMap(fgUniAssigned, fgUnicodeCategory); rangeTokMap->addKeywordMap(fgUniIsSpace, fgUnicodeCategory); fKeywordsInitialized = true; } // --------------------------------------------------------------------------- // UnicodeRangeFactory: Helper methods // --------------------------------------------------------------------------- unsigned short UnicodeRangeFactory::getUniCategory(const unsigned short type) { switch(type) { case XMLUniCharacter::UPPERCASE_LETTER: case XMLUniCharacter::LOWERCASE_LETTER: case XMLUniCharacter::TITLECASE_LETTER: case XMLUniCharacter::MODIFIER_LETTER: case XMLUniCharacter::OTHER_LETTER: return CHAR_LETTER; case XMLUniCharacter::NON_SPACING_MARK: case XMLUniCharacter::COMBINING_SPACING_MARK: case XMLUniCharacter::ENCLOSING_MARK: return CHAR_MARK; case XMLUniCharacter::DECIMAL_DIGIT_NUMBER: case XMLUniCharacter::LETTER_NUMBER: case XMLUniCharacter::OTHER_NUMBER: return CHAR_NUMBER; case XMLUniCharacter::SPACE_SEPARATOR: case XMLUniCharacter::LINE_SEPARATOR: case XMLUniCharacter::PARAGRAPH_SEPARATOR: return CHAR_SEPARATOR; case XMLUniCharacter::CONTROL: case XMLUniCharacter::FORMAT: case XMLUniCharacter::SURROGATE: case XMLUniCharacter::PRIVATE_USE: case XMLUniCharacter::UNASSIGNED: return CHAR_OTHER; case XMLUniCharacter::CONNECTOR_PUNCTUATION: case XMLUniCharacter::DASH_PUNCTUATION: case XMLUniCharacter::START_PUNCTUATION: case XMLUniCharacter::END_PUNCTUATION: case XMLUniCharacter::OTHER_PUNCTUATION: case XMLUniCharacter::INITIAL_PUNCTUATION: case XMLUniCharacter::FINAL_PUNCTUATION: return CHAR_PUNCTUATION; case XMLUniCharacter::MATH_SYMBOL: case XMLUniCharacter::CURRENCY_SYMBOL: case XMLUniCharacter::MODIFIER_SYMBOL: case XMLUniCharacter::OTHER_SYMBOL: return CHAR_SYMBOL; } return 0; } XERCES_CPP_NAMESPACE_END /** * End of file UnicodeRangeFactory.cpp */