When trying to predict protein-coding sequences in DNA sequences, information about the control signalas is often lacking, so that one needs to try 6 possible reading reams, three for each DNA strand. Each reading frame starts one nucleotide further giving rise to a different protein. Because there is not overlap, we can use regular expression to group three bases and match the sequence between two stop codens. Here I add "_" to group every three bases.
# !/user/bin/perl -w
# delete the space
sub trim($) {
my $string = shift;
$string =~ s/^\s+//;
$string =~ s/\s+$//;
return $string;
}
sub add_underline(@) {
my $string = shift;
my $start = shift;
my $result = '';
for ( ; $start < length($string) ; $start += 3 ) {
$result = $result . '_' . substr( $string, $start, 3 );
}
return $result;
}
# get open reading frames from one sequence
sub get_orf($) {
my $string = shift;
my @list;
while ( $string =~ m/((TAA_|TAG_|TGA_)(([ATGC]{3}_)+?)(TAA|TAG|TGA))/g ) {
push @list, $1;
}
return \@list;
}
sub delete_underline($) {
my $string = shift;
$string =~ s/[_]//g;
return $string;
}
# get open reading frames from one sequence
# shift one elment step by step
sub get_orfs(@) {
my $sequence = shift;
my $result_list = shift;
for ( my $k = 0 ; $k < 3 ; $k++ ) {
my $s = add_underline( $sequence, $k );
my $list = get_orf($s);
foreach my $i (@$list) {
my $j = delete_underline($i);
push @$result_list, $j;
}
}
}
# get the complementary of one DNA strand.
sub get_complement(@) {
my $string = shift;
my $new_seq = "";
foreach $c (split(//, $string)) {
if ($c eq 'A') {
$new_seq .= 'T';
}
elsif ($c eq 'T') {
$new_seq .= 'A';
}
elsif ($c eq "G") {
$new_seq .= 'C';
}
else {
$new_seq .= 'G';
}
}
return $new_seq;
}
sub main() {
my $sequence = '';
while ( my $line = <STDIN> ) {
$line = trim($line);
if ( substr( $line, 0, 1 ) eq '>' ) {
if ( length($sequence) != 0 ) {
break;
}
}
elsif ( length($line) != 0 ) {
$sequence .= $line;
}
}
my @list;
get_orfs($sequence, \@list);
$sequence = get_complement($sequence);
get_orfs($sequence, \@list);
@list = sort {length($b) <=> length($a)} @list;
foreach my $i (@list) {
print ">" . length($i) . "\n" . $i . "\n\n";
}
}
main();
updated 10/05/2010
Copyright (c) 2010 Yifan Peng