#!/usr/bin/perl -w use strict; use Bio::SeqIO; use Bio::Seq; # the input arguments are # 1. a directory of genbank genome files (*.gbk) # 2. a filename for our coding sequences to be written to my $dir = $ARGV[0]; my $outfile = $ARGV[1]; my $output = Bio::SeqIO->new(-format => 'fasta', -file => ">$outfile.cds"); my $output_protein = Bio::SeqIO->new(-format => 'fasta', -file => ">$outfile.pep"); opendir(DIR, $dir) || die("cannot open directory '$dir': $!"); my @files = readdir(DIR); my %unique_loci; foreach my $file ( @files ) { print "file is $file\n"; # ABC.gbk.HIJ <-- don't want these # ABC.gbk <-- we want these if( $file =~ /\.gbk$/) { print "Opening $dir/$file...\n"; my $input = Bio::SeqIO->new(-format => 'genbank', -file => "$dir/$file"); while( my $seq = $input->next_seq ) { # get all the feature from the sequence object my @features = $seq->get_SeqFeatures; foreach my $f ( @features ) { # test if the feature TYPE is 'CDS' if( $f->primary_tag eq 'CDS' ) { my $name; if( $f->has_tag('locus_tag') ) { ($name) = $f->get_tag_values('locus_tag'); } else { warn("cannot find a locus tag for this feature ", $f->gff_string, "\n"); } $unique_loci{$name}++; if( $unique_loci{$name} > 1 ) { warn("Locus $name has been seen $unique_loci{$name} times\n"); $name .= ".". $unique_loci{$name}; } my ($desc) = $f->get_tag_values('product'); my $cds_seq = $f->spliced_seq; $cds_seq->display_id($name); $cds_seq->description($desc); $output->write_seq($cds_seq); $output_protein->write_seq($cds_seq->translate); } } } } }