/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "rwctokenizer.h" #include "rwcstring.h" #include RWCSubString RWCTokenizer :: operator()() { // we must have been pointing to a separator so skip over it. If // we are just starting, then nAfterLastToken_ == -1 so advance it to 0 ++nAfterLastToken_; // skip over any leading separators while( nAfterLastToken_ < soStringToBeTokenized_.length() && ( isspace( soStringToBeTokenized_( nAfterLastToken_ ) ) || soStringToBeTokenized_( nAfterLastToken_ ) == '\0' ) ) ++nAfterLastToken_; // case in which there is no additional token if ( nAfterLastToken_ >= soStringToBeTokenized_.length() ) { RWCSubString sub( &soStringToBeTokenized_, 0, 0 ); return( sub ); } int nStartToken = nAfterLastToken_; // this is because we already found (above) that // nAfterLastToken_ points to a non-separator. Thus there // is no need to check that it is a non-separator again. ++nAfterLastToken_; while( nAfterLastToken_ < soStringToBeTokenized_.length() && !isspace( soStringToBeTokenized_( nAfterLastToken_ ) ) && soStringToBeTokenized_( nAfterLastToken_ ) != '\0' ) ++nAfterLastToken_; // at this point, nAfterLastToken_ points to after the last token // or to after the end of the string. Thus there is no + 1 below. int nLength = nAfterLastToken_ - nStartToken; // cout << "nStartToken = " << nStartToken << " nAfterLastToken_ = " << // nAfterLastToken_ << endl; RWCSubString sub( &soStringToBeTokenized_, nStartToken, nLength ); return( sub ); } RWCSubString RWCTokenizer :: operator()( const char cSeparator ) { ++nAfterLastToken_; // skip over any leading tokens while( nAfterLastToken_ < soStringToBeTokenized_.length() && soStringToBeTokenized_[ nAfterLastToken_ ] == cSeparator ) ++nAfterLastToken_; // case in which there are no additional tokens if ( nAfterLastToken_ >= soStringToBeTokenized_.length() ) { RWCSubString sub( &soStringToBeTokenized_, 0, 0 ); return( sub ); } int nStartToken = nAfterLastToken_; // the above loop stopped when nAfterLastToken_ pointed to a non-separator // so skip over this non-separator ++nAfterLastToken_; while( nAfterLastToken_ < soStringToBeTokenized_.length() && soStringToBeTokenized_[ nAfterLastToken_ ] != cSeparator ) ++nAfterLastToken_; // at this point, nAfterLastToken_ either points to just after a found // token or else it points to the end of the string int nLength = nAfterLastToken_ - nStartToken; RWCSubString sub( &soStringToBeTokenized_, nStartToken, nLength ); return( sub ); } RWCSubString RWCTokenizer :: soGetToken( const int n0Token ) { int n0Pos = 0; // skip over any leading tokens while( n0Pos < soStringToBeTokenized_.length() && isspace( soStringToBeTokenized_[ n0Pos ] ) ) { ++n0Pos; } if ( n0Pos >= soStringToBeTokenized_.length() ) { // case of no tokens in string return RWCSubString( NULL, 0, 0); } // skip over tokens = n0Token-1 for( int nTokensToSkipOver = 0; nTokensToSkipOver < n0Token; ++nTokensToSkipOver ) { // first get to end of next token while( n0Pos < soStringToBeTokenized_.length() && isspace( soStringToBeTokenized_[ n0Pos ] ) ) { ++n0Pos; } // now skip whitespace while( n0Pos < soStringToBeTokenized_.length() && isspace( soStringToBeTokenized_[ n0Pos ] ) ) { ++n0Pos; } } // if, in skipping over these tokens, we reached the end of the // string, there is not a n0Token token if ( n0Pos >= soStringToBeTokenized_.length() ) return RWCSubString( NULL, 0, 0); // otherwise, n0Pos must be positioned on the starting character // of the token we want--a nonspace char after a space char int n0StartPos = n0Pos; // now find the end while( n0Pos < soStringToBeTokenized_.length() && !isspace( soStringToBeTokenized_[ n0Pos ] ) ) { ++n0Pos; } // the loop above ends 1 too far--either past the end of the string, // or else on a space char. int nLengthOfToken = n0Pos - n0StartPos; return RWCSubString( &soStringToBeTokenized_, n0StartPos, nLengthOfToken ); }