A DNA Sequence Class in Perl
by Lincoln Stein

Example 1:
sub length {
    my $self = shift;
    return length($self->seq);
}

Example 2:
sub concatenate {
  my $self = shift;
  my ($new_seq,$prepend) = @_;
  croak "argument to concatenate must be a string or a Sequence object"
    if ref($new_seq) && !$new_seq->isa(__PACKAGE__);
  my $to_append = ref($new_seq) ? $new_seq->seq : $new_seq;

  return $self->new($prepend ? $to_append . $self->seq 
                     : $self->seq . $to_append);

Example 3: 
use overload 
    '""'       => 'asString',
    'neg'      => 'reverse',
    '.'        => 'concatenate',
    'fallback' => 'TRUE';


Example 4: 
  $a = new Sequence::Nucleotide('gattttcccgaggagagag');
  $b = new Sequence::Nucleotide('ccccgggatttccggat');
  $c = $a . $b;
  $b = "gatcgatcgaga" . $b;
  $a .= $c;
  print $a,"\n";
DNA(55 residues)
  print $a->seq,"\n";
GATTTTCCCGAGGAGAGAGGATTTTCCCGAGGAGAGAGCCCCGGGATTTCCGGAT
  

Example 5: 
sub new {
  my $class = shift;
  $class = ref($class) if ref($class);
  my ($sequence,$type) = @_;
  my $self = bless {},$class;
  if (ref($sequence)) {
    croak "Can't initialize sequence from non-Sequence object.\n"
      unless $sequence->can('seq');
    %{$self} = %{$sequence};  # clone operation
  } else {
    croak "Doesn't look like sequence data" 
      unless $sequence=~/^[gactnu ]+$/i;
    $self->{'data'} = $self->_canonicalize($sequence);
    $self->{'type'} = $type || ($sequence=~/u/i ? 'RNA' : 'DNA');
  }
  return $self;
}

Example 6: 
(a)
$sequence1 = new Sequence::Nucleotide('gatcccgatttgcagtccaaa');

(b)
$sequence2 = new Sequence::Nucleotide($sequence1);

(c)
$sequence3 = $sequence1->new('gatcccgatttgcagtccaaa');

(d)
$sequence4 = new Sequence::Nucleotide('gatcccgatttgcagtccaaa','RNA');


Listing One
package Sequence::Generic;
# File: Sequence/Generic.pm

use strict;
use Carp;
use overload 
  '""'        => 'asString',
  'neg'       => 'reverse',
  '.'         => 'concatenate',
  'fallback'  => 'TRUE';

# These methods should be overriden by child classes
# class constructor
sub new {
    my $class = shift;
    croak "$class must override the new() method";
}
# Return the sequence as a string
sub seq {
    my $self = shift;
    croak ref($self)," must override the seq() method";
}
# Return the type of the sequence as a human readable string
sub type {
    return 'Generic Sequence';
}
# These methods probably don't have to be overridden
# The length of the sequence
sub length {
    my $self = shift;
    return length($self->seq);
}
# The reverse of the sequence
sub reverse {
    my $self = shift;
    my $reversed = reverse $self->seq;
    return $reversed;
}
# A human-readable description of the object
sub asString {
  my $self = shift;
  return $self->type . '(' . $self->length . ' residues)';
}
# Concatenate two sequences together and return the result

sub concatenate {
  my $self = shift;
  my ($new_seq,$prepend) = @_;
  my ($to_append);
  if (ref($new_seq)) {
      croak "argument to concatenate must be a string or a Sequence object"
      unless $new_seq->isa(__PACKAGE__);
      $to_append = $new_seq->seq ;
  } else {
      $to_append = $new_seq;
  }
  return $self->new($prepend ? $to_append . $self->seq 
                     : $self->seq . $to_append);
}
1;


Listing Two
package Sequence::Nucleotide;
# file: Sequence/Nucleotide.pm

use Sequence::Generic;
use Sequence::Nucleotide::Subsequence;
use Sequence::Alignment;
use Carp;

use strict;
use vars '@ISA';
@ISA = 'Sequence::Generic';

my %CODON_TABLE = (
           UCA => 'S',UCG => 'S',UCC => 'S',UCU => 'S',
           UUU => 'F',UUC => 'F',UUA => 'L',UUG => 'L',
           UAU => 'Y',UAC => 'Y',UAA => '*',UAG => '*',
           UGU => 'C',UGC => 'C',UGA => '*',UGG => 'W',
           CUA => 'L',CUG => 'L',CUC => 'L',CUU => 'L',
           CCA => 'P',CCG => 'P',CCC => 'P',CCU => 'P',
           CAU => 'H',CAC => 'H',CAA => 'Q',CAG => 'Q',
           CGA => 'R',CGG => 'R',CGC => 'R',CGU => 'R',
           AUU => 'I',AUC => 'I',AUA => 'I',AUG => 'M',
           ACA => 'T',ACG => 'T',ACC => 'T',ACU => 'T',
           AAU => 'N',AAC => 'N',AAA => 'K',AAG => 'K',
           AGU => 'S',AGC => 'S',AGA => 'R',AGG => 'R',
           GUA => 'V',GUG => 'V',GUC => 'V',GUU => 'V',
           GCA => 'A',GCG => 'A',GCC => 'A',GCU => 'A',
           GAU => 'D',GAC => 'D',GAA => 'E',GAG => 'E',
           GGA => 'G',GGG => 'G',GGC => 'G',GGU => 'G',
          );
*complement = *reversec = \&reverse;

sub new {
  my $class = shift;
  $class = ref($class) if ref($class);
  my ($sequence,$type) = @_;

  my $self = bless {},$class;
  if (ref($sequence)) {
    croak "Can't initialize sequence from non-Sequence object.\n"
      unless $sequence->can('seq');
    %{$self} = %{$sequence};  # clone operation
  } else {
    croak "Doesn't look like sequence data" 
      unless $sequence=~/^[gactnu\s]+$/i;
    $self->{'data'} = $self->_canonicalize($sequence);
    $self->{'type'} = $type || ($sequence=~/u/i ? 'RNA' : 'DNA');
  }
  return $self;
}
sub seq {
    my $self = shift;
    $self->{'data'} = $self->_canonicalize($_[0])  if defined($_[0]);
    my $seq = $self->{'data'};
    return $seq unless $self->is_RNA;
    $seq=~tr/T/U/;
    return $seq;
}
sub type {
    my $self = shift;
    return defined($_[0]) ? $self->{'type'} = $_[0] : $self->{'type'};
}
sub is_DNA {
    my $self = shift;
    return $self->type eq 'DNA';
}
sub is_RNA {
  my $self = shift;
  return $self->type eq 'RNA';
}
sub subseq {
  my $self = shift;
  my ($start,$end) = @_;
  return (__PACKAGE__ . '::Subsequence')->new($self,$start,$end);
}
sub reverse {
  my $self = shift;
  return (__PACKAGE__ . '::Subsequence')->new($self,$self->length,1);
}
sub translate {
  my $self = shift;
  my $frame = shift() || 1;
  my $l = $self->length;
  my $seq = $frame > 0 ? $self->subseq($frame,$l-($l-$frame+1)%3)
              : $self->reverse->subseq(abs($frame),$l-($l-abs($frame)+1)%3);
  my $s = $seq->seq;
  $s=~tr/T/U/;  # put it in RNA mode
  $s =~ s/(\S{3})/$CODON_TABLE{$1} || 'X'/eg;
  return $s;
}
sub longest_orf {
    my $self = shift;

    my ($max,$pos,$frame);
    foreach (-3..-1,1..3) {
    my $translation = $self->translate($_);
    while ($translation=~/([^*]+)/g) {
        if (length($1) > length($max)) {
        $max = $1;
        $frame = $_;
        $pos = pos($translation) - length($max); 
        }
    }
    }
    $pos *= 3;
    $pos += abs($frame);
    return ($pos,$pos+3*length($max)-1) if $frame > 0;
    return ($self->length-$pos,$self->length-$pos-3*length($max));
}
sub align {
    my $self = shift;
    my $seq = shift;
    $seq = $seq->seq if ref($seq);
    return new Sequence::Alignment(src=>$seq,target=>$self->seq);
}
sub _canonicalize {
  my $self = shift;
  my $seq = shift;
  $seq =~ tr/uU/tT/;
  $seq =~ s/[^gatcn]//ig;
  return uc($seq);
}
1;


5


