#!/usr/bin/perl -w use strict; use Bio::SearchIO; my $dir = $ARGV[0] || '.'; opendir(DIR, $dir) || die "cannot open dir '$dir': $!"; my %result; # where we put the results of BLAST parsing my %gene2sp; for my $file ( readdir(DIR) ) { if( $file =~ /(\S+)\.(BLASTP)$/ ) { my ($stem,$ext) = ($1,$2); warn "stem is $stem for $file\n"; my ($query, $database); if( $stem =~ /(\S+)\.pep-vs-(\S+)\.pep/ ) { ($query,$database) = ($1,$2); } else { warn("Filename $file does not have a stem that I know how to match\n"); next; # go to the next $file in the loop of the readdir } warn("query= $query and database= $database\n"); my $input = Bio::SearchIO->new(-format => 'blast', -file => "$dir/$file"); while( my $r = $input->next_result ) { my $qname = $r->query_name; my $qlen = $r->query_length; $gene2sp{$qname} = $query; while( my $h = $r->next_hit ) { my $hname = $h->name; my $hlen = $h->length; next if $qname eq $hname; my $evalue = $h->significance; #while( my $hsp = $h->next_hsp ) { #} $result{$qname}->{$hname} = [$evalue]; } } } } my @clusters = &make_linkages(\%result); my $i = 0; for my $cluster (@clusters ) { for my $gene ( @$cluster ) { print join("\t", $i, $gene2sp{$gene}, $gene), "\n"; } $i++; } sub make_linkages { my ($data) = @_; my @clusters; my %seen; for my $g1 ( keys %$data ) { for my $g2 ( keys %{$data->{$g1}} ) { if( $data->{$g2}->{$g1} ) { # reciprocal, yah! my $cluster_id1 = $seen{$g1}; my $cluster_id2 = $seen{$g2}; if( defined $cluster_id1 && defined $cluster_id2 ) { if( $cluster_id1 == $cluster_id2 ) { #we've already seen and classified these genes into the same cluster next; } else { # we need to merge two separate clusters # we've already seen and classified these genes, # but they are in separate clusters #we need to join these two clusters together # so we need move all the genes into a single cluster for my $gene ( @{$clusters[$cluster_id2]} ) { push @{$clusters[$cluster_id1]}, $gene; # change the stored cluster ID for the gene $seen{$gene} = $cluster_id1; } undef $clusters[$cluster_id2]; } } elsif( defined $cluster_id1 ) { # gene1 is already assigned to cluster push @{$clusters[$cluster_id1]}, $g2; $seen{$g2} = $cluster_id1; } elsif( defined $cluster_id2 ) { # gene2 is already assigned to cluster push @{$clusters[$cluster_id2]}, $g1; $seen{$g1} = $cluster_id2; } else { # neither gene has been assigned to a cluster # need to make a new cluster # add it to the end of the list of clusters $cluster_id1 = scalar @clusters; $cluster_id2 = scalar @clusters; push @clusters, [$g1,$g2]; $seen{$g1} = $cluster_id1; $seen{$g2} = $cluster_id2; } } else { # search is not reciprocal } } } return grep { defined } @clusters; } sub unique { my %unique; return grep { ! $unique{$_}++ } @_; }