#!/usr/bin/perl ##---------------------------------------------------------------------------## ## File: ## @(#) RepbaseEMBL.pm ## Author: ## Robert M. Hubley rhubley@systemsbiology.org ## Description: ## A simple EMBL object. This is a simple datastructure ## for encapsulating the EMBL formatted sequence data and ## annotations. **THIS IS NOT A COMPLETE IMPLEMENTATION OF ## THE EMBL FORMAT** but rather restricted to the features ## currently used by Repbase. Two methods provide the ability ## to read/write Repbase EMBL formatted files. ## #****************************************************************************** #* Copyright (C) Institute for Systems Biology 2003-2004 Developed by #* Robert Hubley. #* #* This work is licensed under the Open Source License v2.1. To view a copy #* of this license, visit http://www.opensource.org/licenses/osl-2.1.php or #* see the license.txt file contained in this distribution. #* #****************************************************************************** # Implementation Details: # # The internal data storage is defined below. Data accessor # methods have not been built ( but should be before external # access is allowed ). Currently the file's records ( defined # below ) are stored in an array: # # bless( {'records'} => [ {#EMBL Record}, {#EMBL Record}, .. ]}, # 'RepbaseEMBL' ); # # Repbase/EMBL Record EMBL Tag # ------------------------------------------------ -------- # { 'id' => blah, ID # 'dataclass' => blah, ID # 'molecule' => blah, ID # 'divisions' => [ blah, blah, blah ], ID # 'accessions' => [ blah, blah, blah ], AC # 'dateCreated' => blah, DT # 'dateUpdated' => blah, DT # 'comments' => [ blah, blah, blah, blah ], CC # 'keywords' => [ blah, blah, blah, blah ], KW # 'description' => blah, DE # 'speciesName' => blah, OS # 'commonSpeciesName' => blah, OS # 'classification' => [ blah, blah, blah, blah ], OC # 'references' => [ { 'number' = > blah, RN # 'comment' => blah, RC # 'position' => blah, RP # 'xref' => blah, RX # 'authors' => blah, RA # 'title' => blah, RT # 'location' => blah } ], RL # 'xref' => blah, DR # 'length' => blah, ID # 'composition' => { 'A' => blah, SQ # 'C' => blah, SQ # 'G' => blah, SQ # 'T' => blah, SQ # 'other' => 'blah' }, SQ # 'sequence' => blah }; SQ # #****************************************************************************** # =head1 NAME RepbaseEMBL =head1 SYNOPSIS use RepbaseEMBL Usage: $DB = RepbaseEMBL->new( fileName => "humrep.ref" ); =head1 DESCRIPTION A simple EMBL object. This is a simple datastructure for encapsulating the EMBL formatted sequence data and annotations. **THIS IS NOT A COMPLETE IMPLEMENTATION OF THE EMBL FORMAT** but rather restricted to the features currently used by Repbase. Two methods provide the ability to read/write Repbase EMBL formatted files. Caveats: - Currently no attempt is made to store the ordering of tag groups. The output ordering is fixed and may not reflect the original input ordering. Ie. it appears there is a convention of using a "CC" tag following an "OS" tag to store the common name of a speciess: OS Anopheles gambiae CC African malaria mosquito. The EMBL Standard requires that this information be placed on the "OS" line and surrounded by parantheses. The reader does not currently differentiate between "CC" locations in a record. Therefore this will be reformatted upon output as: OS Anopheles gambiae OC .. .. XX CC African malaria mosquito. - The line breaks/spacing in comment blocks are potentially important. This object attempts to keep save the line break and spacing information for several types of data including comment tags. It will be the responsibility of the data user to verify line width and reformat paragraphs as the data is manipulated. =head1 INSTANCE METHODS =cut package RepbaseEMBL; use strict; use PubRef; use RepbaseRecord; use Carp; use Data::Dumper; use Text::Wrap; use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION); require Exporter; @ISA = qw(); @EXPORT = qw(); @EXPORT_OK = qw(); %EXPORT_TAGS = ( all => [ @EXPORT_OK ] ); # # Version # my $VERSION = "0.1"; my $CLASS = "RepbaseEMBL"; # # EMBL Tags used by Repbase # my %validTagBits = ( 'ID' => 1, 'CC' => 2, 'AC' => 4, 'DT' => 8, 'DE' => 16, 'KW' => 32, 'OS' => 64, 'OC' => 128, 'RN' => 256, 'RP' => 512, 'RC' => 1024, 'RA' => 2048, 'RT' => 4096, 'RL' => 8192, 'DR' => 16384, 'SQ' => 32768, 'RX' => 65536, 'FT' => 131072, 'RG' => 262144, 'FH' => 524288, 'NM' => 1048576 ); ##-------------------------------------------------------------------------## ## Constructor ##-------------------------------------------------------------------------## sub new { my $class = shift; my %nameValuePairs = @_; # Create ourself as a hash my $this = {}; $this->{'records'} = []; # Bless this hash in the name of the father, the son... bless $this, $class; if ( %nameValuePairs ) { my %validKeys = ( fileName => 1 ); while ( my ( $name, $value ) = each( %nameValuePairs ) ) { unless ( exists $validKeys{$name} ) { croak "$CLASS::new: Error $name is not a valid " . "parameter to the constructor!\n\n" . "Usage: \$DB = $CLASS->new();\n" . " \$DB = $CLASS->new( fileName=>\"humrep.ref\" );\n"; } $this->{$name} = $value; } } if ( exists $this->{'fileName'} ) { _parseFromFile( $this, $this->{'fileName'} ); } return $this; } ##-------------------------------------------------------------------------## ## Get and Set Methods ##-------------------------------------------------------------------------## ##-------------------------------------------------------------------------## ## General Object Methods ##-------------------------------------------------------------------------## ##-------------------------------------------------------------------------## =head2 Use: $DB->writeEMBLFile( $fileName ); Write out this object as an EMBL file; =cut ##-------------------------------------------------------------------------## sub writeEMBLFile { my $this = shift; my $fileName = shift; open OUT, ">$fileName" or die "$CLASS::writeEMBLFile() Unable to open " . "file $fileName for writing: $!\n"; foreach my $record ( @{ $this->{'records'} } ) { print OUT "" . &toEMBLString( $record ); } close OUT; } ##-------------------------------------------------------------------------## =head2 Use: my $string = toEMBLString( $recordRef ); Convert a single EMBL record back to EMBL format. NOTE: This is a static method. =cut ##-------------------------------------------------------------------------## sub toEMBLString { my $recRef = shift; # EMBL Data Structure my $outStr = toEMBLHeader( $recRef ); my $seq = $recRef->getSequence(); my $seqIndex = 0; while ( ( $seq =~ s/^(.{60})// ) ) { $seqIndex += 60; my $lineSeq = $1; $lineSeq =~ s/(\w{10})/$1 /g; $outStr .= " " . $lineSeq . " $seqIndex\n"; } $seqIndex += length( $seq ); $seq =~ s/(\w{10})/$1 /g; my $numPadding = 68 - length( $seq ); $outStr .= " " . $seq . ( " " x $numPadding ) . $seqIndex . "\n"; $outStr .= "//\n"; return $outStr; } sub toEMBLHeader { my $recRef = shift; # EMBL Data Structure my $tmpStr = ""; my $outStr = ""; $outStr = "ID " . $recRef->getId() . " " . $recRef->getDataclass() . "; " . $recRef->getMolecule() . "; " . $recRef->getDivisionsElement( 0 ) . "; " . $recRef->getLength() . " BP.\n"; $outStr .= "XX\n"; # my $accessionStr = ""; if ( $recRef->getAccessionsCount() > 0 ) { $accessionStr = join( "; ", $recRef->getAccessionsArray() ); } ## TODO Need a routine to break lines here $accessionStr =~ s/^/AC /g; $outStr .= $accessionStr . "\n"; $outStr .= "XX\n"; # if ( $recRef->getDateCreated() ne "" ) { $outStr .= "DT " . $recRef->getDateCreated() . "\n"; } if ( $recRef->getDateUpdated() ne "" ) { $outStr .= "DT " . $recRef->getDateUpdated() . "\n"; } $outStr .= "XX\n"; # if ( $recRef->getDescription() ne "" ) { $tmpStr = $recRef->getDescription(); $tmpStr =~ s/^/DE /gm; $outStr .= $tmpStr; $outStr .= "XX\n"; } # # Keywords # if ( defined $recRef->getKeywordsArray() ) { $tmpStr = join( "; ", $recRef->getKeywordsArray() ); $outStr .= "KW $tmpStr.\n"; $outStr .= "XX\n"; } # # Organism Name # $outStr .= "OS " . $recRef->getSpeciesName(); if ( $recRef->getCommonSpeciesName() ne "" ) { $outStr .= "(" . $recRef->getCommonSpeciesName() . ")"; } $outStr .= "\n"; # # Organism Classification # if ( $recRef->getClassificationCount() > 0 ) { $outStr .= "OC "; $tmpStr = join( "; ", $recRef->getClassificationArray() ) . "."; # TODO Break up lines! $outStr .= $tmpStr . "\n"; $outStr .= "XX\n"; } # # References # foreach my $reference ( $recRef->getReferencesArray() ) { $outStr .= "RN " . $reference->getNumber(); if ( $reference->getPosition() ne "" ) { $tmpStr = $reference->getPosition(); $tmpStr =~ s/^/RP /gm; $outStr .= $tmpStr; } if ( $reference->getComment ne "" ) { $tmpStr = $reference->getComment(); $tmpStr =~ s/^/RC /gm; $outStr .= $tmpStr; } if ( $reference->getAuthors() ne "" ) { $tmpStr = $reference->getAuthors(); $tmpStr =~ s/^/RA /gm; $outStr .= $tmpStr; } if ( $reference->getTitle() ne "" ) { $tmpStr = $reference->getTitle(); $tmpStr =~ s/^/RT /gm; $outStr .= $tmpStr; } if ( $reference->getLocation() ne "" ) { $tmpStr = $reference->getLocation(); $tmpStr =~ s/^/RL /gm; $outStr .= $tmpStr; } if ( $reference->getXref() ne "" ) { $tmpStr = $reference->getXref(); $tmpStr =~ s/^/RX /gm; $outStr .= $tmpStr; } $outStr .= "XX\n"; } # # Database references # if ( $recRef->getXref() ne "" ) { $tmpStr = $recRef->getXref(); $tmpStr =~ s/^/DR /gm; $outStr .= $tmpStr; $outStr .= "XX\n"; } # # Comments # foreach my $comment ( $recRef->getCommentsArray() ) { $comment =~ s/^/CC /gm; $outStr .= $comment; $outStr .= "XX\n"; } # # Sequence # $outStr .= "SQ Sequence " . $recRef->getLength() . " BP;"; foreach my $base ( "A", "C", "G", "T", "other" ) { if ( defined $recRef->getCompositionElement( $base ) ) { $outStr .= " " . $recRef->getCompositionElement( $base ) . " $base;"; } } $outStr .= "\n"; return ( $outStr ); } sub getRecordCount { my $this = shift; return ( $#{ $this->{'records'} } + 1 ); } sub getRecord { my $this = shift; my $index = shift; return if ( $index < 0 || $index > $#{ $this->{'records'} } ); return $this->{'records'}->[ $index ]; } sub getRelease { my $this = shift; return $this->{'release'}; } ##-------------------------------------------------------------------------## ## Private Object Methods ##-------------------------------------------------------------------------## ##-------------------------------------------------------------------------## ## Use: my _parseFromFile( $obj, $filename ); ## ## Parse a FASTA output file into this datastructure. ## ##-------------------------------------------------------------------------## sub _parseFromFile { my $this = shift; my $filename = shift; open IN, "<$filename" || die "$CLASS::_parseFromFile() Unable to open " . "Repbase/EMBL file $filename: $!\n"; my $prevTag = ""; my $tag = ""; my $data = ""; my $recordRef = RepbaseRecord->new(); my $firstRecordFound = 0; while ( ) { ## Ignore spacers next if ( /^XX/ ); ## Get file release if it exists if ( /^CC\s+RELEASE\s+(\d+)/ ) { $this->{'release'} = $1; } ## Process the end record tag if ( /^\/\/\s*$/ || eof( IN ) ) { if ( $data ne "" ) { # Parse last tag _parseTagData( $tag, $data, $recordRef ); # Save record! push @{ $this->{'records'} }, $recordRef; $recordRef = RepbaseRecord->new(); } $data = ""; next; } ## Capture the tag! if ( /^(\S\S)(?:\s+(\S.*))?/ ) { $prevTag = $tag; $tag = $1; ## Check validity of the tag if ( defined $validTagBits{$1} ) { # If this tag group has ended check to see # if we have data to parse if ( $tag ne $prevTag ) { if ( $data ne "" ) { # Parse last tag _parseTagData( $prevTag, $data, $recordRef ); } # New tag so overwrite old data if ( defined $2 ) { $data = $2 . "\n"; } else { $data = "\n"; } } else { # This is a tag continuation so # append the data if ( defined $2 ) { $data .= $2 . "\n"; } else { $data .= "\n"; } } } else { croak "Error - invalid tag $1\n"; } } else { if ( /^\s+[\w\s]+\d+$/ ) { $data .= $_; } else { print STDERR "Warning...unknown stuff <$_>\n"; } } } close IN; } ##-------------------------------------------------------------------------## ## Use: my _parseTagData( $tag, $data, $recRef ); ## ## Parse a Repbase/EMBL data element. This is a helper function ## for the general parser. This function expects data for ## multiline tags to already be compressed into a single ## blob. Ie. ## ## CC line1 ## CC line2 ## ## Should be passed to this function as: ## ## $tag = "CC" ## $data = "line1\nline2" ## ##-------------------------------------------------------------------------## sub _parseTagData { my $tag = shift; my $data = shift; my $recRef = shift; return if ( $tag ne "ID" && !defined $recRef->getId() ); if ( $tag eq "ID" ) { # IDentification line if ( $data =~ /(\S+)\s+(\S+)\;\s+(\S+)\s*\;\s+(\S+)\s*\;\s+(\d+)\s+BP.*/ ) { $recRef->setId( $1 ); $recRef->setDataclass( $2 ); $recRef->setMolecule( $3 ); $recRef->pushDivisions( $4 ); } else { croak "parseTagData: ID field not to EMBL" . " spec \"$data\"\n"; } } elsif ( $tag eq "CC" ) { if ( $data =~ /RepeatMasker\s+Annotations/i ) { if ( $data =~ /Type:[ \t]*(\S+)[ \t]*[\n\r]/i ) { $recRef->setRMType( $1 ); } if ( $data =~ /SubType:[ \t]*(\S+)[ \t]*[\n\r]/i ) { $recRef->setRMSubType( $1 ); } # TODO: Decide if we want to be consistent with # other lists...ie. place a coma between entities if ( $data =~ /Species:[ \t]*([\S \t]+)[ \t]*[\n\r]/i ) { my $speciesStr = $1; $speciesStr =~ s/\s//g; $speciesStr =~ s/(.*),$/$1/g; my @species = split( /\,/, $speciesStr ); foreach my $species ( @species ) { if ( $species =~ /Search/ ) { die "Hmmm $speciesStr\n"; } $species =~ s/\s//g; $recRef->pushRMSpecies( $species ); } } if ( $data =~ /Refineable/ ) { $recRef->setRMRefineable( 1 ); } if ( $data =~ /SearchStages:\s*([\d\s*\,\s*]+)[\n\r]/i ) { my $stageStr = $1; $stageStr =~ s/\s//g; $stageStr =~ s/(.*),$/$1/g; my @stages = split( /\,/, $stageStr ); foreach my $stage ( @stages ) { $recRef->pushRMSearchStages( $stage ); } } if ( $data =~ /BufferStages:([^\n\r]*)[\n\r]+/i ) { my $bufData = $1; while ( $bufData =~ s/\s*(\d+\s*(?:\[\s*\d+\s*-\s*\d+\s*\]\s*)*),?//m ) { my $bufStr = $1; $bufStr =~ s/\s//g; if ( $bufStr ne "" ) { $recRef->pushRMBufferStages( $bufStr ); } } } if ( $data =~ /Description:\s+(\S.*)/i ) { $recRef->setRMDescription( $1 ); } } # Coment line # Multi-record/multi-line comment allowed $recRef->pushComments( $data ); } elsif ( $tag eq "AC" ) { # ACession line my @accessions = split( /\s*[;\n\r]\s*/, $data ); foreach my $accession ( @accessions ) { $recRef->pushAccessions( $accession ); } } elsif ( $tag eq "DT" ) { # DaTe line # Parse lines while ( $data =~ /^(.*)$/gm ) { $_ = $1; if ( /(\d+)-(\S+)(-\d+)?\s+\(Rel.\s+(\d+(\.\d+)*), Created\)/ ) { $recRef->setDateCreated( $_ ); } elsif ( /(\d+)-(\w+)(-\d+)?\s+\(Rel. (\d+(\.\d+)*), Last updated, Version (\-?\d+)\)/ ) { if ( /Version \-1/ ) { s/Version \-1/Version 1/g; } $recRef->setDateUpdated( $_ ); } else { croak "parseTagData: DT field not to EMBL" . " spec \"$_\"\n"; } } } elsif ( $tag eq "DE" ) { # Description line $data =~ s/[\n\r]//g; $recRef->setDescription( $data ); } elsif ( $tag eq "KW" ) { # KeyWord line my @keywords = split( /\s*[;\n\r]\s*/, $data ); # Strip last dot ( used as punctuation in RB EMBL format ) if ( $keywords[ $#keywords ] =~ /(\S+)\./ ) { $keywords[ $#keywords ] = $1; } foreach my $keyword ( @keywords ) { $recRef->pushKeywords( $keyword ); } } elsif ( $tag eq "OS" ) { ## It looks like GIRI has been using a "CC" line following ## the "OS" line to record additional information about the ## organism name. Almost looks like they are using it to store ## the common name. There is an EMBL standard for using the "OC" ## line and a pair of parentheses to store the common name. $data =~ s/[\n\r]//g; if ( ( $data =~ s/\(\s*(.*)\s*\)// ) ) { $recRef->setCommonSpeciesName( $1 ); } $recRef->setSpeciesName( $data ); } elsif ( $tag eq "OC" ) { my @classifications = split( /\s*[;\n\r]\s*/, $data ); # Strip last dot ( used as punctuation in RB EMBL format ) if ( $classifications[ $#classifications ] =~ /(\S+)\./ ) { $classifications[ $#classifications ] = $1; } foreach my $classification ( @classifications ) { $recRef->pushClassification( $classification ); } } elsif ( $tag eq "RN" ) { my $newRef = PubRef->new(); $newRef->setNumber( $data ); $recRef->pushReferences( $newRef ); } elsif ( $tag eq "RP" ) { my $lastRef = $recRef->getReferencesElement( $recRef->getReferencesCount() - 1 ); $lastRef->setPosition( $data ); } elsif ( $tag eq "RC" ) { my $lastRef = $recRef->getReferencesElement( $recRef->getReferencesCount() - 1 ); $lastRef->setComment( $data ); } elsif ( $tag eq "RA" ) { my $lastRef = $recRef->getReferencesElement( $recRef->getReferencesCount() - 1 ); $lastRef->setAuthors( $data ); } elsif ( $tag eq "RT" ) { my $lastRef = $recRef->getReferencesElement( $recRef->getReferencesCount() - 1 ); $lastRef->setTitle( $data ); } elsif ( $tag eq "RL" ) { my $lastRef = $recRef->getReferencesElement( $recRef->getReferencesCount() - 1 ); $lastRef->setLocation( $data ); } elsif ( $tag eq "RX" ) { my $lastRef = $recRef->getReferencesElement( $recRef->getReferencesCount() - 1 ); $lastRef->setXref( $data ); } elsif ( $tag eq "DR" ) { $recRef->setXref( $data ); } elsif ( $tag eq "SQ" ) { my $seq = ""; while ( $data =~ /^(.*)$/gm ) { $_ = $1; if ( /Sequence\s+(\d+)\s+BP;(?:\s+(\d+)\s+A;\s+(\d+)\s+C;\s+(\d+)\s+G;\s+(\d+)\s+T;\s+(\d+)\s+other;?)?/ ) { $recRef->setLength( $1 ); $recRef->setComposition( 'A', $2 ); $recRef->setComposition( 'C', $3 ); $recRef->setComposition( 'G', $4 ); $recRef->setComposition( 'T', $5 ); $recRef->setComposition( 'other', $6 ); } elsif ( /^\s+([acgtbdhvrykmswnxACGTBDHVRYKMSWNX\s]+)\d+$/ ) { my $seqLine .= $1; $seqLine =~ s/\s//g; $seq .= $seqLine; } else { croak "SQ record for " . $recRef->getId() . " is not valid! $_\n"; } } $recRef->setSequence( $seq ); } elsif ( $tag eq "FT" ) { # Do nothing for now } elsif ( $tag eq "RG" ) { # Do nothing for now } elsif ( $tag eq "FH" ) { # Do nothing for now } elsif ( $tag eq "NM" ) { # Do nothing for now } else { croak "Not sure how to deal with $tag\n"; } } 1;