//
// Copyright 2009 Illumina, Inc.
//
// This software is covered by the "Illumina Genome Analyzer Software
// License Agreement" and the "Illumina Source Code License Agreement",
// and certain third party copyright/licenses, and any user of this
// source file is bound by the terms therein (see accompanying files
// Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
// Illumina_Source_Code_License_Agreement.pdf and third party
// copyright/license notices).
//
//

/// \file

/// \author Ivan Mikoulitch
///

#include "SamMD/fastaReader.h"
#include "stringSplitterJet.h"
#include "SamMD/log.h"
#include "SamMD/utils.h"

//////////////////////////////////////////////////////////////////////
FastaReader::FastaReader(void)
{
}

//////////////////////////////////////////////////////////////////////
FastaReader::FastaReader(string fileName)
{
	fileName_=fileName;
}

//////////////////////////////////////////////////////////////////////
FastaReader::~FastaReader(void)
{
	Close();	
}

//////////////////////////////////////////////////////////////////////
bool FastaReader::Open()
{
//	pos_=0;

	fs_.open(fileName_.c_str());

	return fs_.is_open();
}

//////////////////////////////////////////////////////////////////////
void FastaReader::Close()
{
	if(fs_.is_open()) fs_.close();

	fileName_.clear();
	ref_header_.clear();
	ref_header_next_.clear();
	ref_sequence_.clear();

//	line_.clear();
//	sequences_.clear();
//	current_ref_sequence_.clear();
//	current_ref_id.clear();
}

/*
//////////////////////////////////////////////////////////////////////
string FastaReader::GetNextFastaReference()
{
	string line;

	while(getline(fs_, line))
	{
		if(line.empty()) continue;

		if(IsHeaderLine(line)) 
		{
			// reading first sequence line
			getline(fs_, line_);

			pos_=0;
			start_=1;
			end_ = start_-1+line_.size();

			return line;
		}
	}

	return "";
}

//////////////////////////////////////////////////////////////////////
void FastaReader::ReadSequences()
{
	ifstream fs;

	try
	{
		fs.open(fileName_.c_str());

		if(!fs.is_open()) return;

		ReadSequences(fs);
	}
	catch(...)
	{
	}

	if(fs.is_open()) fs.close();
}
*/

//////////////////////////////////////////////////////////////////////
bool inline FastaReader::IsHeaderLine(string& line)
{
	return line.size()>0 && line[0]=='>';
}

//////////////////////////////////////////////////////////////////////
bool FastaReader::ReadNextReference()
{
	ref_sequence_.clear();

	if(fs_.eof()) 
		return false;

	// looking for the first header line
	if(ref_header_next_.empty())
	{
		while(getline(fs_, ref_header_))
		{
			if(IsHeaderLine(ref_header_)) break;
		}
	}
	else
	{
		ref_header_=ref_header_next_;
		ref_header_next_.clear();
	}

	if (ref_header_.empty()) return false; // end of file

	Log::Info(">>> Reading reference '" + ref_header_ + "'");
	Log::Info(">>> From file: '" + fileName_ + "'.");

	const int BUFFER_SIZE = 4096;
	char buffer[BUFFER_SIZE];

	unsigned long num_reads=0;

	while(fs_.getline(buffer, BUFFER_SIZE))
	{
		if(buffer[0]=='>') // new sequence
		{
			ref_header_next_.assign(buffer);
			break;
		}
		else
		{
			ToUpper(buffer);
			ref_sequence_.append(buffer);
		}

		if (++num_reads % 100000 == 0) Log::Info("   # reference reads = " + to_string(num_reads) + ".");
	}

	Log::Info("<<< Done.");

	return !ref_header_.empty() && !ref_sequence_.empty();
}

/*
//////////////////////////////////////////////////////////////////////
void FastaReader::ReadSequences(ifstream& fs)
{
	string line;
	string header_line;
	string seq_line;

	// looking for the first header line
	while(getline(fs, header_line))
	{
		if(IsHeaderLine(header_line)) break;
	}

	if (header_line.empty()) return; // end of file

	while(getline(fs, line))
	{
		if(line.empty()) continue;

		if(IsHeaderLine(line)) // new sequence
		{
			sequences_[header_line]=seq_line;
			header_line=line;
			seq_line.clear();
		}
		else
		{
			seq_line.append(line);
		}
	}

	if(!header_line.empty() && !seq_line.empty())
	{
		sequences_[header_line]=seq_line;
	}

	return;
}

//////////////////////////////////////////////////////////////////////
string FastaReader::GetSequence(const string& ref_id)
{
	string ref_id_gt=">"+ref_id;

	map<string, string>::const_iterator p = sequences_.begin();

	while(p!=sequences_.end())
	{
		if(0==ref_id_gt.compare(0, ref_id_gt.size(), p->first))
		{
			return p->second;
		}

		++p;
	}
}

//////////////////////////////////////////////////////////////////////
string FastaReader::GetMatchDescriptor(const string& read, const string& ref_id, long ref_position, const string& cigar)
{
	if (current_ref_id!=ref_id)
	{
		string sequence = GetSequence(ref_id);
		if(sequence.empty()) return "";
		current_ref_sequence_=sequence;
		current_ref_id=ref_id;
	}												  

	char cigar_operation;
	int cigar_length;
	int cigar_position=0;
	int read_position=0;

	ostringstream match_descriptor;

	for(int i=0; i<cigar.size(); i++)
	{
		switch(cigar[i])
		{
		case 'M':
		case 'I':
		case 'D':
			cigar_operation=cigar[i];
			cigar_length = atoi(cigar.substr(cigar_position, i-cigar_position).c_str());
			cigar_position=i+1;
			break;
		default:
			continue;
		}

		switch(cigar[i])
		{
			case 'M':
			{
				int match_length=0;

				for(int j=0; j<cigar_length; j++)
				{
					if(read[read_position]==current_ref_sequence_[ref_position]) 
					{
						match_length++;
					}
					else 
					{
						if (match_length>0) 
						{
							match_descriptor << match_length;
							match_length=0;
						}

						match_descriptor << current_ref_sequence_[ref_position];
					}

					++read_position;
					++ref_position;
				}

				if (match_length>0) match_descriptor << match_length;

				break;
			}
			case 'I':
			{
				match_descriptor << '^';
				
				for(int j=0; i<cigar_length; j++)
				{
					match_descriptor << current_ref_sequence_[ref_position++];
				}
				
				match_descriptor << '$';
				
				break;
			}
			case 'D':
			{
				match_descriptor << '^' << cigar_length << '$';

				break;
			}
		}
	}

	return match_descriptor.str();
}
*/

const int BUFFER_SIZE = 4096;
char buffer[BUFFER_SIZE];

//////////////////////////////////////////////////////////////////////
string FastaReader::GetMatchDescriptorInfo(const string& sam_data)
{
	int num_char_copied = sam_data.copy(buffer, sam_data.size());
	buffer[num_char_copied] = 0x0;
	ToUpper(buffer);

	// parsing sam data
	StringSplitterJet data_ss(buffer);
	string ref_position_str = data_ss.GetString(1);
	string cigar = data_ss.GetString(2);
	string read  = data_ss.GetString(3);

	unsigned int ref_position = atoi(ref_position_str.c_str());

	// to make zero based
	ref_position--;

	//char cigar_operation;
	int cigar_length;
	string cigar_length_str;
	int cigar_position=0;
	int read_position=0;
	string s_start_length;
	string s_end_length;

	ostringstream match_descriptor;

        int match_length=0;

	for(unsigned int i=0; i<cigar.size(); i++)
	{

		switch(cigar[i])
		{
		case 'M':
		case 'I':
		case 'D':
		case 'N':
		case 'S':
		case 'H':
		case 'P':
			//cigar_operation=cigar[i];
			cigar_length_str = cigar.substr(cigar_position, i-cigar_position);
			cigar_length = atoi(cigar_length_str.c_str());
			cigar_position=i+1;
			if (cigar_length>150 && (cigar[i]=='D' or cigar[i]=='N')) // csaunders - change to treat 'N' exactly like 'D'
			{
				// to exclude this read
				return "";
			}
			break;
		default:
			continue;
		}

		switch(cigar[i])
		{
			case 'M':
			{
				for(int j=0; j<cigar_length; j++)
				{
					if(read[read_position]==ref_sequence_[ref_position]) 
					{
						match_length++;
					}
					else 
					{
						if (match_length>0) 
						{
							match_descriptor << match_length;
							match_length=0;
						}

//						match_descriptor << ref_sequence_[ref_position];
						
						// Modified by request from Chris Saunders:
						// I don't anticipate any aligners/packages besides CASAVA will support 
						// export format in the future, and I'm not confident that any component 
						// of CASAVA will be able to handle input containing ambiguous characters 
						// other than 'N'. Unfortunately I don't think we have a choice but to compress 
						// all ambiguous characters to 'N'.
						switch(ref_sequence_[ref_position])
						{
						case 'A':
						case 'C':
						case 'G':
						case 'T':
						case 'N':
						case 'a':
						case 'c':
						case 'g':
						case 't':
						case 'n':
							{
								match_descriptor << ref_sequence_[ref_position];
								break;
							}
						default:
							{
								match_descriptor << 'N';
								break;
							}
						}

					}

					++read_position;
					++ref_position;
				}

				break;
			}
			// sam insertion makes deletion ^5$
			case 'I':
			{
                                if (match_length>0)
                                {
                                        match_descriptor << match_length;
                                        match_length=0;
                                }
				match_descriptor << '^' << cigar_length << '$';

				read_position+=cigar_length;
				
				break;
			}
			// sam deletion makes insertion ^ACGT$
			case 'D':
                        case 'N':   // csaunders -- change to treat 'N' exactly like 'D'
			{
                                if (match_length>0)
                                {
                                        match_descriptor << match_length;
                                        match_length=0;
                                }
				match_descriptor << '^';
				
				for(int j=0; j<cigar_length; j++)
				{
					match_descriptor << ref_sequence_[ref_position++];
				}
				
				match_descriptor << '$';

				break;
			}
			case 'S': // ignoring S
			{
				// we are supposed to see S operation at the beginning or end of the read
				if (read_position==0) // read start
					s_start_length=cigar_length_str;
				else // read end
					s_end_length=cigar_length_str;

				// skipping S region
				read_position+=cigar_length;
				break;
			}
			case 'H': // ignoring H
			{
				// we are supposed to see S operation at the beginning or end of the read
				if (read_position==0) // read start
					s_start_length=cigar_length_str;
				else // read end
					s_end_length=cigar_length_str;

				// not skipping H region

				break;
			}
			case 'P': // not supported
			{
				break;
			}
		}
	}

        if (match_length>0)
        {
                match_descriptor << match_length;
                match_length=0;
        }

	string mdi(1,'\t'); // first tab means that the line was processed to create match descriptor
// not needed
//	mdi.append(qname); 
//	mdi.append(1,'\t');
	mdi.append(match_descriptor.str());
	mdi.append(1,'\t');
	mdi.append(s_start_length.size()?s_start_length:"0");
	mdi.append(1,'\t');
	mdi.append(s_end_length.size()?s_end_length:"0");

	return mdi;
}