/* * created: 2010 * * This file is part of Artemis * * Copyright(C) 2010 Genome Research Limited * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or(at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */ package uk.ac.sanger.artemis.components.variant; import java.util.List; import java.util.regex.Pattern; import uk.ac.sanger.artemis.Feature; import uk.ac.sanger.artemis.FeatureVector; import uk.ac.sanger.artemis.io.Range; import uk.ac.sanger.artemis.io.RangeVector; import uk.ac.sanger.artemis.sequence.AminoAcidSequence; import uk.ac.sanger.artemis.sequence.Bases; public class VCFRecord { //private static Logger logger = Logger.getLogger(VCFRecord.class); private String chrom; private int pos; private String ID; private String ref; private VariantBase var; private float quality; private String filter; private String info; private String infos[]; private String format; private String genotypeData[][]; private short synFlag = -1; private boolean markAsNewStop = false; protected static Pattern MULTI_ALLELE_PATTERN = Pattern.compile( "^[AGCTNMRWSYKBDHVagctnmrwsykbdhv]+,[AGCTNMRWSYKBDHVagctnmrwsykbdhv,]+$"); protected static Pattern COLON_PATTERN = Pattern.compile(":"); protected static Pattern SEMICOLON_PATTERN = Pattern.compile(";"); protected static Pattern TAB_PATTERN = Pattern.compile("\\t"); /** * Return the string representation of the VCF record as a * tab-delimited string. */ public String toString() { return chrom+"\t"+pos+"\t"+ID+"\t"+ref+"\t"+var.toString()+"\t"+quality+ "\t"+filter+"\t"+info+"\t"+format+"\t"+getSampleDataString(); } /** * Parse a VCF line and return a VCFRecord * @param line * @return */ protected static VCFRecord parse(final String line, int nsamples) { final VCFRecord rec = new VCFRecord(); final String parts[] = split(line, "\t", 9+nsamples); //final String parts[] = TAB_PATTERN.split(line); rec.chrom = parts[0]; rec.pos = Integer.parseInt(parts[1]); rec.ID = parts[2]; rec.ref = parts[3]; rec.var = new VariantBase(rec, parts[4]); try { rec.quality = Float.parseFloat(parts[5]); } catch(NumberFormatException e) { rec.quality = 0.f; } rec.filter = parts[6]; rec.info = parts[7]; if(parts.length > 9) { rec.format = (parts[8]).trim(); final int nfmt = countOccurrences(rec.format, ':')+1; //rec.format.split(":").length; nsamples = parts.length-9; rec.genotypeData = new String[nsamples][nfmt]; for(int i=0; i features, int basePosition) { //logger.info("getSynFlag(List) current syn : " + synFlag + " size? " + features.size()); if(synFlag == -1) this.synFlag = isSynonymous(features, basePosition); return synFlag; } /** * @param features * @param basePosition * @return * 0 if non-synonymous; * 1 if synonymous; * 2 if non-synonymous and creates a stop codon * 3 not within a gene */ private short isSynonymous(FeatureVector features, int basePosition) { char variant = getAlt().toString().toLowerCase().charAt(0); for(int i = 0; i - 1) return isSyn; } return 3; } private short isSynonymous(List features, int basePosition) { char variant = getAlt().toString().toLowerCase().charAt(0); for(CDSFeature feature : features) { short isSyn = checkSyn(feature, basePosition, variant); if(isSyn > - 1) return isSyn; } return 3; } protected static short checkSyn(CDSFeature gfeat, int basePosition, char variant) { //logger.info("CDSFEATURE\t"+gfeat); //logger.info("BASEANDVARIANT\t"+basePosition + "\t" + variant); if(gfeat.firstBase < basePosition && gfeat.lastBase > basePosition) { RangeVector ranges = gfeat.ranges; for(int j=0; j< ranges.size(); j++) { Range range = (Range) ranges.get(j); if(j > 0) { if(gfeat.isFwd) gfeat.intronlength+=range.getStart()-gfeat.lastRange.getEnd()-1; else gfeat.intronlength+=gfeat.lastRange.getStart()-range.getEnd()-1; if(gfeat.intronlength < 0) gfeat.intronlength = 0; } if(range.getStart() < basePosition && range.getEnd() > basePosition) { int mod; int codonStart; if(gfeat.isFwd) { mod = (basePosition-gfeat.firstBase-gfeat.intronlength)%3; codonStart = basePosition-gfeat.firstBase-gfeat.intronlength-mod; } else { mod = (gfeat.lastBase-basePosition-gfeat.intronlength)%3; codonStart = gfeat.lastBase-basePosition-gfeat.intronlength-mod; } try { if(codonStart+3 > gfeat.bases.length()) return 0; char codon[] = gfeat.bases.substring(codonStart, codonStart + 3).toLowerCase().toCharArray(); char aaRef = AminoAcidSequence.getCodonTranslation(codon[0], codon[1], codon[2]); //logger.info(String.format("%d %d %s%s%s", mod, codonStart, codon[0],codon[1],codon[2])); if(!gfeat.isFwd) variant = Bases.complement(variant); codon[mod] = variant; //logger.info(String.format("%d %d %s%s%s", mod, codonStart, codon[0],codon[1],codon[2])); char aaNew = AminoAcidSequence.getCodonTranslation(codon[0], codon[1], codon[2]); if (aaNew == aaRef) return 1; else if(AminoAcidSequence.isStopCodon(aaNew)) return 2; else return 0; } catch(Exception e) { for(int k=0; k