/*****************************************************************************
#   Copyright (C) 1994-2008 by David Gordon.
#   All rights reserved.                           
#                                                                           
#   This software is part of a beta-test version of the Consed/Autofinish
#   package.  It should not be redistributed or
#   used for any commercial purpose, including commercially funded
#   sequencing, without written permission from the author and the
#   University of Washington.
#   
#   This software is provided ``AS IS'' and any express or implied
#   warranties, including, but not limited to, the implied warranties of
#   merchantability and fitness for a particular purpose, are disclaimed.
#   In no event shall the authors or the University of Washington be
#   liable for any direct, indirect, incidental, special, exemplary, or
#   consequential damages (including, but not limited to, procurement of
#   substitute goods or services; loss of use, data, or profits; or
#   business interruption) however caused and on any theory of liability,
#   whether in contract, strict liability, or tort (including negligence
#   or otherwise) arising in any way out of the use of this software, even
#   if advised of the possibility of such damage.
#
#   Building Consed from source is error prone and not simple which is
#   why I provide executables.  Due to time limitations I cannot
#   provide any assistance in building Consed.  Even if you do not
#   modify the source, you may introduce errors due to using a
#   different version of the compiler, a different version of motif,
#   different versions of other libraries than I used, etc.  For this
#   reason, if you discover Consed bugs, I can only offer help with
#   those bugs if you first reproduce those bugs with an executable
#   provided by me--not an executable you have built.
# 
#   Modifying Consed is also difficult.  Although Consed is modular,
#   some modules are used by many other modules.  Thus making a change
#   in one place can have unforeseen effects on many other features.
#   It may take months for you to notice these other side-effects
#   which may not seem connected at all.  It is not feasible for me to
#   provide help with modifying Consed sources because of the
#   potentially huge amount of time involved.
#
#*****************************************************************************/
#include    "rwctokenizer.h"
#include    "rwcstring.h"
#include    <ctype.h>


// Returns the next token of soStringToBeTokenized_, where a token is a
// maximal run of characters that are neither whitespace (per isspace)
// nor '\0'.  Scanning resumes from nAfterLastToken_, which is left just
// past the returned token so that the following call continues from
// there.  When no further token exists, a zero-length substring at
// position 0 of soStringToBeTokenized_ is returned.
RWCSubString RWCTokenizer :: operator()() {
   
   // we must have been pointing to a separator so skip over it.  If 
   // we are just starting, then nAfterLastToken_ == -1 so advance it to 0
   ++nAfterLastToken_;

   // skip over any leading separators.
   // isspace() is only defined for values representable as unsigned char
   // (or EOF), so a plain char that may be negative is converted first.
   while( nAfterLastToken_ < soStringToBeTokenized_.length() ) {
      const char cCurrent = soStringToBeTokenized_( nAfterLastToken_ );
      if ( cCurrent != '\0' &&
           !isspace( static_cast<unsigned char>( cCurrent ) ) )
         break;
      ++nAfterLastToken_;
   }


   // case in which there is no additional token
   if ( nAfterLastToken_ >= soStringToBeTokenized_.length() ) {
      RWCSubString sub( &soStringToBeTokenized_, 0, 0 );
      return( sub );
   }

   int nStartToken = nAfterLastToken_;


   // we already found (above) that nAfterLastToken_ points to a
   // non-separator, so there is no need to check it again--step past it.

   ++nAfterLastToken_;

   // advance to the separator that ends this token, or to the end of
   // the string
   while( nAfterLastToken_ < soStringToBeTokenized_.length() ) {
      const char cCurrent = soStringToBeTokenized_( nAfterLastToken_ );
      if ( cCurrent == '\0' ||
           isspace( static_cast<unsigned char>( cCurrent ) ) )
         break;
      ++nAfterLastToken_;
   }

   // at this point, nAfterLastToken_ points to after the last token
   // or to after the end of the string.  Thus there is no + 1 below.

   int nLength = nAfterLastToken_ - nStartToken;

   RWCSubString sub( &soStringToBeTokenized_, nStartToken, nLength );
   return( sub );
}


// Returns the next token of soStringToBeTokenized_ using cSeparator as
// the only delimiter.  Runs of consecutive separators are collapsed, so
// an empty token is never produced between them.  nAfterLastToken_ is
// advanced to just past the returned token.  When nothing but separators
// (or nothing at all) remains, a zero-length substring at position 0 of
// soStringToBeTokenized_ is returned.
RWCSubString RWCTokenizer :: operator()( const char cSeparator ) {

   // step off the position where the previous call stopped
   ++nAfterLastToken_;

   // skip over any leading separators
   for( ; nAfterLastToken_ < soStringToBeTokenized_.length();
        ++nAfterLastToken_ ) {
      if ( soStringToBeTokenized_[ nAfterLastToken_ ] != cSeparator )
         break;
   }

   // ran off the end of the string: no more tokens
   if ( nAfterLastToken_ >= soStringToBeTokenized_.length() )
      return RWCSubString( &soStringToBeTokenized_, 0, 0 );

   const int nTokenBegin = nAfterLastToken_;

   // the character at nTokenBegin is already known to be a non-separator,
   // so always step past it before testing the following characters
   do {
      ++nAfterLastToken_;
   } while( nAfterLastToken_ < soStringToBeTokenized_.length() &&
            soStringToBeTokenized_[ nAfterLastToken_ ] != cSeparator );

   // nAfterLastToken_ is now on the separator that ends the token, or at
   // the end of the string, so the difference is exactly the token length
   return RWCSubString( &soStringToBeTokenized_,
                        nTokenBegin,
                        nAfterLastToken_ - nTokenBegin );
}


// Returns the n0Token-th (0-based) whitespace-delimited token of
// soStringToBeTokenized_.  The string is scanned from its beginning on
// every call; nAfterLastToken_ is neither read nor modified.  Only
// isspace() characters separate tokens here.
//
// Returns RWCSubString( NULL, 0, 0 ) when the string contains no tokens
// or fewer than n0Token + 1 of them.  A negative n0Token skips nothing
// and therefore yields the first token.
RWCSubString RWCTokenizer :: soGetToken( const int n0Token ) {

   int n0Pos = 0;

   // skip over any leading whitespace.
   // isspace() is only defined for values representable as unsigned char
   // (or EOF), hence the conversion of each character before testing it.

   while( n0Pos < soStringToBeTokenized_.length() &&
          isspace( static_cast<unsigned char>(
                      soStringToBeTokenized_[ n0Pos ] ) ) ) {
      ++n0Pos;
   }


   if ( n0Pos >= soStringToBeTokenized_.length() ) {
      // case of no tokens in string
      return RWCSubString( NULL, 0, 0);
   }

   // skip over the n0Token tokens that precede the one we want
   for( int nTokensToSkipOver = 0; nTokensToSkipOver < n0Token; 
        ++nTokensToSkipOver ) {
      // first get to end of the current token: advance while we are
      // still on non-whitespace characters

      while( n0Pos < soStringToBeTokenized_.length() &&
             !isspace( static_cast<unsigned char>(
                          soStringToBeTokenized_[ n0Pos ] ) ) ) {
         ++n0Pos;
      }
      
      // now skip the whitespace separating it from the next token

      while( n0Pos < soStringToBeTokenized_.length() &&
             isspace( static_cast<unsigned char>(
                         soStringToBeTokenized_[ n0Pos ] ) ) ) {
         ++n0Pos;
      }
   }

   // if, in skipping over these tokens, we reached the end of the
   // string, there is not a n0Token token

   if ( n0Pos >= soStringToBeTokenized_.length() )
      return RWCSubString( NULL, 0, 0);


   // otherwise, n0Pos must be positioned on the starting character
   // of the token we want--a nonspace char

   int n0StartPos = n0Pos;

   // now find the end

   while( n0Pos < soStringToBeTokenized_.length() &&
          !isspace( static_cast<unsigned char>(
                       soStringToBeTokenized_[ n0Pos ] ) ) ) {
      ++n0Pos;
   }
   
   // the loop above ends 1 too far--either past the end of the string,
   // or else on a space char--so no + 1 is needed for the length.

   int nLengthOfToken = n0Pos - n0StartPos;

   return RWCSubString( &soStringToBeTokenized_, n0StartPos, nLengthOfToken );
}