#include "FileRecordTypeChecker.h" #include "api/BamReader.h" #include "ParseTools.h" FileRecordTypeChecker::FileRecordTypeChecker() : _eofHit(false), _inheader(false) { _fileType = UNKNOWN_FILE_TYPE; _recordType = UNKNOWN_RECORD_TYPE; _numFields = 0; _isBinary = false; _isText = false; _isBed = false; _isDelimited = false; _delimChar = '\t'; //tab by default _firstValidDataLineIdx = -1; _isVCF = false; _isBAM = false; _isGFF = false; _isGFFplus = false; _isGzipped = false; _isCompressed = false; _insufficientData = false; _fourthFieldNumeric = false; _givenEmptyBuffer = false; _isGroupBy = false; // TO DO: Bed4, Bed5, and BedGraph are missing from all of these. _hasName[UNKNOWN_RECORD_TYPE] = false; _hasName[EMPTY_RECORD_TYPE] = false; _hasName[BED3_RECORD_TYPE] = false; _hasName[BED6_RECORD_TYPE] = true; _hasName[BED12_RECORD_TYPE] = true; _hasName[BED_PLUS_RECORD_TYPE] = true; _hasName[BED6_PLUS_RECORD_TYPE] = true; _hasName[BAM_RECORD_TYPE] = true; _hasName[VCF_RECORD_TYPE] = true; _hasName[GFF_RECORD_TYPE] = true; _hasName[GFF_PLUS_RECORD_TYPE] = true; _hasName[NO_POS_PLUS_RECORD_TYPE] = true; _hasScore[UNKNOWN_RECORD_TYPE] = false; _hasScore[EMPTY_RECORD_TYPE] = false; _hasScore[BED3_RECORD_TYPE] = false; _hasScore[BED6_RECORD_TYPE] = true; _hasScore[BED12_RECORD_TYPE] = true; _hasScore[BED_PLUS_RECORD_TYPE] = true; _hasScore[BED6_PLUS_RECORD_TYPE] = true; _hasScore[BAM_RECORD_TYPE] = true; _hasScore[VCF_RECORD_TYPE] = true; _hasScore[GFF_RECORD_TYPE] = true; _hasScore[GFF_PLUS_RECORD_TYPE] = true; _hasScore[NO_POS_PLUS_RECORD_TYPE] = true; _hasStrand[UNKNOWN_RECORD_TYPE] = false; _hasStrand[EMPTY_RECORD_TYPE] = false; _hasStrand[BED3_RECORD_TYPE] = false; _hasStrand[BED6_RECORD_TYPE] = true; _hasStrand[BED12_RECORD_TYPE] = true; _hasStrand[BED_PLUS_RECORD_TYPE] = true; //actually, unknown. Give benefit of doubt. _hasStrand[BED6_PLUS_RECORD_TYPE] = true; _hasStrand[BAM_RECORD_TYPE] = true; _hasStrand[VCF_RECORD_TYPE] = true; _hasStrand[GFF_RECORD_TYPE] = true; _hasStrand[GFF_PLUS_RECORD_TYPE] = true; _hasStrand[NO_POS_PLUS_RECORD_TYPE] = true; _recordTypeNames[UNKNOWN_RECORD_TYPE] = "Unknown record type"; _recordTypeNames[EMPTY_RECORD_TYPE] = "Empty record type"; _recordTypeNames[BED3_RECORD_TYPE] = "Bed3 record type"; _recordTypeNames[BED6_RECORD_TYPE] = "Bed6 record type"; _recordTypeNames[BED12_RECORD_TYPE] = "Bed12 record type"; _recordTypeNames[BED_PLUS_RECORD_TYPE] = "BedPlus record type"; _recordTypeNames[BAM_RECORD_TYPE] = "BAM record type"; _recordTypeNames[VCF_RECORD_TYPE] = "VCF record type"; _recordTypeNames[GFF_RECORD_TYPE] = "Gff record type"; _recordTypeNames[GFF_PLUS_RECORD_TYPE] = "GffPlus record type"; _recordTypeNames[NO_POS_PLUS_RECORD_TYPE] = "NoPosPlus record type"; _fileTypeNames[UNKNOWN_FILE_TYPE] = "Unknown file type"; _fileTypeNames[EMPTY_FILE_TYPE] = "Empty file type"; _fileTypeNames[SINGLE_LINE_DELIM_TEXT_FILE_TYPE] = "Delimited text file type"; _fileTypeNames[GZIP_FILE_TYPE] = "Gzip file type"; _fileTypeNames[BAM_FILE_TYPE] = "BAM file type"; _fileTypeNames[VCF_FILE_TYPE] = "VCF file type"; } bool FileRecordTypeChecker::scanBuffer(const char *buffer, size_t len, bool eofHit, bool isCompressed) { _eofHit = eofHit; _isCompressed = isCompressed; _numBytesInBuffer = len; if (_numBytesInBuffer == 0) { _fileType = EMPTY_FILE_TYPE; _recordType = EMPTY_RECORD_TYPE; return true; } //special: the first thing we do is look for a gzipped file. if (!_isGzipped && ((unsigned char)(buffer[0]) == 0x1f)) { _isGzipped = true; return true; } //scan the first 8K block of the streamBuf. //now we have a buffer from the file. //first, test to see if it's binary or text. if (isBinaryBuffer(buffer, len)) { _isText = false; _isBinary = true; return true; } else { _isText = true; _isBinary = false; return handleTextFormat(buffer, len); } } bool FileRecordTypeChecker::isBinaryBuffer(const char *buffer, size_t len) { if (isBAM(buffer)) { return true; } //Let's say that in a text file, at least 90% of the characters //should be alphanumeric, whitespace, or punctuation. static const float PERCENTAGE_PRINTABLE = .9; int alphaNumCount = 0; int whiteSpaceCount = 0; int punctuationCount = 0; for (int i=0; i < (int)len; i++) { char currChar = buffer[i]; if (isalnum(currChar)) { alphaNumCount++; } else if (isspace(currChar)) { whiteSpaceCount++; } else if (ispunct(currChar)) { punctuationCount++; } } if ((float)(alphaNumCount + whiteSpaceCount + punctuationCount) / (float)(_numBytesInBuffer) < PERCENTAGE_PRINTABLE) { return true; } return false; } bool FileRecordTypeChecker::isBAM(const char *buffer) { //check for BAM. The Bam Magic String is "BAM\1", and should be the first 4 characters of the file. if (strncmp(buffer, "BAM\1", 4) == 0) { _isBAM = true; _fileType = BAM_FILE_TYPE; _recordType = BAM_RECORD_TYPE; return true; } //TBD: Handle other binary formats return false; } bool FileRecordTypeChecker::handleTextFormat(const char *buffer, size_t len) { if (isVCFformat(buffer)) { return isTextDelimtedFormat(buffer, len); } else if (isTextDelimtedFormat(buffer, len)) { //At this point, _isText and _isDelimited are set. _numFields and _delimChar are //set. _fileType = SINGLE_LINE_DELIM_TEXT_FILE_TYPE; if (_isGroupBy) { _recordType = NO_POS_PLUS_RECORD_TYPE; return true; } //Tokenize the first line of valid data into fields. //Need to make a copy so next call to tokenizer doesn't overwrite the line. string line(_tokenizer.getElem(_firstValidDataLineIdx)); // ditch \r for Windows if necessary. if (line.size() && line[line.size()-1] == '\r') { line.resize(line.size()-1); } _tokenizer.setKeepFinalIncompleteElem(Tokenizer::USE_NOW); _tokenizer.setNumExpectedItems(_numFields); _tokenizer.tokenize(line, _delimChar); if (_tokenizer.getNumFields(line, _delimChar) != _numFields) { cerr << "Error: Type checker found wrong number of fields while tokenizing data line." << endl; cerr << "Perhaps you have extra TAB at the end of your line? Check with \"cat -t\""<< endl; exit(1); } if (isBedFormat()) { _isBed = true; if (_numFields == 3) { _recordType = BED3_RECORD_TYPE; } else if (_numFields == 4) { if (isNumeric(_tokenizer.getElem(3))) { _recordType = BEDGRAPH_RECORD_TYPE; _fourthFieldNumeric = true; } else { _fourthFieldNumeric = false; _recordType = BED4_RECORD_TYPE; _hasStrand[BED4_RECORD_TYPE] = isStrandField(3); } } else if (_numFields == 5 && passesBed5()) { _recordType = BED5_RECORD_TYPE; } else if (_numFields == 6 && passesBed6()) { _recordType = BED6_RECORD_TYPE; } else if (_numFields == 12 && passesBed12()) { _recordType = BED12_RECORD_TYPE; } else if (_numFields >3) { if (_numFields >= 6 && isStrandField(5)) { _recordType = BED6_PLUS_RECORD_TYPE; } else { _recordType = BED_PLUS_RECORD_TYPE; } } return true; } if (isGFFformat()) { if (_isGFFplus) { _recordType = GFF_PLUS_RECORD_TYPE; return true; } _isGFF = true; _recordType = GFF_RECORD_TYPE; return true; } //Here the Record must not have positions, so it is the NoPosPlus Type. _recordType = NO_POS_PLUS_RECORD_TYPE; return false; } return false; } bool FileRecordTypeChecker::isVCFformat(const char *buffer) { if (_isVCF) { return true; //previous pass through this method has determined file is VCF. } if (memcmp(buffer, "##fileformat=VCF", 16) == 0) { _isVCF = true; _fileType = VCF_FILE_TYPE; _recordType = VCF_RECORD_TYPE; return true; } return false; } bool FileRecordTypeChecker::isBedFormat() { //test that the file has at least three fields. //2nd and 3rd fields of first valid data line must be integers. 3rd must not be less than 2nd. if (_numFields < 3) { return false; } //the 2nd and 3rd fields must be numeric. if (!isInteger(_tokenizer.getElem(1)) || !isInteger(_tokenizer.getElem(2))) { return false; } int start = str2chrPos(_tokenizer.getElem(1)); int end = str2chrPos(_tokenizer.getElem(2)); if (end < start) { return false; } return true; } bool FileRecordTypeChecker::isGFFformat() { //a GFF file may have 8 or 9 fields. More than thats is GFFplus if (_numFields < 7 ) { return false; } //the 4th and 5th fields must be numeric. if (!isNumeric(_tokenizer.getElem(3)) || !isNumeric(_tokenizer.getElem(4))) { return false; } int start = str2chrPos(_tokenizer.getElem(3)); int end = str2chrPos(_tokenizer.getElem(4)); if (end < start) { return false; } if (_numFields > 8) { _isGFFplus = true; } return true; } bool FileRecordTypeChecker::isTextDelimtedFormat(const char *buffer, size_t len) { //Break single string buffer into vector of strings. Delimiter is newline. _tokenizer.setKeepFinalIncompleteElem(Tokenizer::IGNORE); int numLines = _tokenizer.tokenize(buffer, '\n', _eofHit, _isCompressed); //anticipated delimiter characters are tab, comma, and semi-colon. //If we need new ones, they must be added in this method. //search each line for delimiter characters. vector tabCounts; vector commaCounts; vector semicolonCounts; tabCounts.reserve(numLines); commaCounts.reserve(numLines); semicolonCounts.reserve(numLines); //loop through the lines, ignoring headers. Count potential delimiters, //see if we can find a few lines with the same number of a given delimiter. //delims are tested in hierarchical order, starting with tab,then comma, then semi-colon. int validLinesFound=0; int headerCount = 0; int emptyLines = 0; for (int i=0; i < numLines; i++ ) { if (validLinesFound >=4) { break; //really only need to look at like 4 lines of data, max. } const string line = _tokenizer.getElem(i); //skip over any empty line if (line.size() == 0) { emptyLines++; continue; } // //skip over any header line // if (_inheader) { headerCount++; _inheader = false; //inheaders can only apply to first line continue; } if (isHeaderLine(line)) { //clear any previously found supposedly valid data lines, because valid lines can only come after header lines. if (_firstValidDataLineIdx > -1 && _firstValidDataLineIdx < i) { _firstValidDataLineIdx = -1; validLinesFound--; headerCount++; } headerCount++; continue; } //a line must have some alphanumeric characters in order to be valid. bool hasAlphaNum = false; for (int j=0; j < len; j++) { if(isalnum(line[j])) { hasAlphaNum = true; break; } } if (!hasAlphaNum) { continue; } validLinesFound++; if (_firstValidDataLineIdx == -1) { _firstValidDataLineIdx = i; } int tab_count = std::count(line.begin(), line.end(), '\t'); int comma_count = std::count(line.begin(), line.end(), ','); int semicolon_count = std::count(line.begin(), line.end(), ';'); tabCounts.push_back(tab_count); commaCounts.push_back(comma_count); semicolonCounts.push_back(semicolon_count); } if (headerCount + emptyLines == numLines) { _insufficientData = true; } if (validLinesFound == 0) { return false; } _insufficientData = false; if (delimiterTesting(tabCounts, '\t')) { return true; } else if (validLinesFound) { return true; } if (delimiterTesting(commaCounts, ',')) { return true; } if (delimiterTesting(semicolonCounts, ';')) { return true; } return false; //unable to detect delimited file. } bool FileRecordTypeChecker::delimiterTesting(vector &counts, char suspectChar) { //check to see if we found the same number of tabs in every line. int numDelims = counts[0]; if (numDelims != 0) { bool countsMatch = true; for (int i=1; i < (int)counts.size(); i++) { if (counts[i] != numDelims) { countsMatch = false; } } if (countsMatch) { //Hurray!! We have successfully found a delimited file. _isDelimited = true; _delimChar = suspectChar; _numFields = numDelims + 1; return true; } else { return false; } } else { // there is just a single column with no delimiter. _numFields = 1; return false; } } void FileRecordTypeChecker::setBam() { _fileType = BAM_FILE_TYPE; _recordType = BAM_RECORD_TYPE; _isBinary = true; _isBAM = true; } bool FileRecordTypeChecker::passesBed5() { return _isBed && _numFields == 5 && isNumeric(_tokenizer.getElem(4)); } bool FileRecordTypeChecker::passesBed6() { return (_isBed && _numFields == 6 && isStrandField(5)); } bool FileRecordTypeChecker::passesBed12() { return (isStrandField(5) && isNumeric(_tokenizer.getElem(6)) && isNumeric(_tokenizer.getElem(7)) && isNumeric(_tokenizer.getElem(9))); } bool FileRecordTypeChecker::isStrandField(int field) { const string &strandChar = _tokenizer.getElem(field); return (strandChar == "+" || strandChar == "-" || strandChar == "."); }