How to extract open reading frames from DNA sequence

When trying to predict protein-coding sequences in DNA sequences, information about the control signalas is often lacking, so that one needs to try 6 possible reading reams, three for each DNA strand. Each reading frame starts one nucleotide further giving rise to a different protein. Because there is not overlap, we can use regular expression to group three bases and match the sequence between two stop codens. Here I add "_" to group every three bases.

		
# !/user/bin/perl -w

# delete the space
sub trim($) {
   my $string = shift;
   $string =~ s/^\s+//;
   $string =~ s/\s+$//;
   return $string;
}

sub add_underline(@) {
   my $string = shift;
   my $start  = shift;
   my $result = '';
   for ( ; $start < length($string) ; $start += 3 ) {
      $result = $result . '_' . substr( $string, $start, 3 );
   }
   return $result;
}

# get open reading frames from one sequence
sub get_orf($) {
   my $string = shift;
   my @list;
   while ( $string =~ m/((TAA_|TAG_|TGA_)(([ATGC]{3}_)+?)(TAA|TAG|TGA))/g ) {
      push @list, $1;
   }
   return \@list;
}

sub delete_underline($) {
   my $string = shift;
   $string =~ s/[_]//g;
   return $string;
}

# get open reading frames from one sequence
# shift one elment step by step
sub get_orfs(@) {
   my $sequence = shift;
   my $result_list = shift;
   for ( my $k = 0 ; $k < 3 ; $k++ ) {
      my $s = add_underline( $sequence, $k );
      my $list = get_orf($s);
      foreach my $i (@$list) {
         my $j = delete_underline($i);
         push @$result_list, $j;
      }
   }
}

# get the complementary of one DNA strand.
sub get_complement(@) {
   my $string = shift;
   my $new_seq = "";
   foreach $c (split(//, $string)) {
      if ($c eq 'A') {
         $new_seq .= 'T';
      }
      elsif ($c eq 'T') {
         $new_seq .= 'A';
      }
      elsif ($c eq "G") {
         $new_seq .= 'C';
      }
      else {
         $new_seq .= 'G';
      }
   }
   return $new_seq;
}

sub main() {
   my $sequence = '';
   while ( my $line = <STDIN> ) {
      $line = trim($line);
      if ( substr( $line, 0, 1 ) eq '>' ) {
         if ( length($sequence) != 0 ) {
            break;
         }
      }
      elsif ( length($line) != 0 ) {
         $sequence .= $line;
      }
   }

   my @list;
   get_orfs($sequence, \@list);
   $sequence = get_complement($sequence);
   get_orfs($sequence, \@list);
   
   @list = sort {length($b) <=> length($a)} @list;
   foreach my $i (@list) {
      print ">" . length($i) . "\n" . $i . "\n\n";
   }
}

main();

	

updated 10/05/2010

Copyright (c) 2010 Yifan Peng