#!/usr/bin/perl -w use strict; use Bio::AlignIO; # this script will trim alignments and concatenate them together into # single alignment # it will trim both gapped columns # and cases where windows have % identity < 60% my $dir = shift @ARGV || die "need a directory"; my $window = 2; my $cutoff = 40; my $ext = "\.pep\.aln"; my @aln_array; my @files; opendir(DIR, $dir) || die $!; for my $file ( readdir(DIR) ) { if( $file =~ /$ext$/ ) { my $in = Bio::AlignIO->new(-format => 'fasta', -file => "$dir/$file"); if( my $aln = $in->next_aln ) { # do something with aln push @aln_array, $aln; push @files, $file; } } } # print the type of object that I have # print ref($object), "\n"; my %seqs; my $j = 0; for my $aln ( @aln_array ) { my $seqaln_name = $files[$j]; last if $j++ > 10; my $trimmed_aln = $aln->remove_gaps; # iterate through windows across aln for( my $i = 1; $i+$window < $trimmed_aln->length; $i += $window) { my $slice = $trimmed_aln->slice($i, $i + $window - 1); my $pid = $slice->percentage_identity; printf "%s %d %.2f\n",$seqaln_name, $i, $pid; if( $pid > $cutoff ) { # use this window for my $seq ( $slice->each_seq ) { my $id; my $seqid = substr($seq->display_id,0,1); if( $seqid eq 'b' ) { $id = 'ecoli'; } elsif( $seqid eq 'y' ) { $id = 'yersinia'; } elsif( $seqid eq 'S' ) { $id = 'salmonella'; } else { warning("seqid is $seqid for ",$seq->display_id, "\n"); $id = 'unknown'; } $seqs{$id} .= $seq->seq; } } } } my $newaln = Bio::SimpleAlign->new; for my $id ( keys %seqs ) { $newaln->add_seq( Bio::LocatableSeq->new(-display_id => $id, -seq => $seqs{$id}) ); } my $out = Bio::AlignIO->new(-file => ">concatenated_trimmedaln.nex", -format => 'nexus', -show_symbols => 0, # for mrbayes' happiness -show_endblock => 0, # ditto ); $out->write_aln($newaln);