#!/usr/bin/perl -w use strict; use Bio::SeqIO; use Bio::Seq; # the input arguments are # 1. a directory of genbank genome files (*.gbk) # 2. a filename for our interval sequences to be written to # get all intergenic sequences my $dir = $ARGV[0]; my $outfile = $ARGV[1]; my $output = Bio::SeqIO->new(-format => 'fasta', -file => ">$outfile.interals.fa"); opendir(DIR, $dir) || die("cannot open directory '$dir': $!"); my @files = readdir(DIR); my %unique_loci; foreach my $file ( @files ) { if( $file =~ /\.gbk$/) { print "Opening $dir/$file...\n"; my $input = Bio::SeqIO->new(-format => 'genbank', -file => "$dir/$file"); while( my $seq = $input->next_seq ) { # get all the feature from the sequence object my @features = $seq->get_SeqFeatures; my @genes; foreach my $f ( @features ) { # test if the feature TYPE is 'gene' if( $f->primary_tag eq 'gene' ) { push @genes, $f; } } @genes = sort { $a->start <=> $b->start } @genes; my @gene_names; for my $gene ( @genes ) { my ($name)= $gene->get_tag_values('gene'); if( $name ) { push @gene_names, $name; } else { push @gene_names, $gene->get_tag_values('locus_tag'); } } my $num_genes = scalar @genes; for( my $i = 0; $i < $num_genes-1; $i++ ) { if( $genes[$i]->overlaps($genes[$i+1]) ) { warn("skipping $gene_names[$i] overlaps $gene_names[$i+1]\n"); next; } my ($start,$end) = ( $genes[$i]->end, $genes[$i+1]->start ); if( $start > $end ) { warn("start > end for \n\t", $genes[$i]->gff_string, "\n\t", $genes[$i+1]->gff_string, "\n"); } my $interval = $seq->subseq($start,$end); my $id = sprintf("%s_%d_%d",$seq->accession_number,$start,$end); my $desc = sprintf("%s_%s",$gene_names[$i],$gene_names[$i+1]); my $intervalseq = Bio::Seq->new(-seq => $interval, -id => $id, -desc => $desc); $output->write_seq($intervalseq); } } } }