#!/usr/bin/perl -w use strict; use Bio::DB::Fasta; use Bio::SeqIO; my $cluster_file = shift @ARGV; my $cluster_dir = shift @ARGV || 'clusterortho'; my $pepdir = shift @ARGV || 'proteins'; my $cdsdir = shift @ARGV || 'cds'; my $cdsdb = Bio::DB::Fasta->new($cdsdir, -glob => '*.{cds,fas,fasta,fa}'); my $pepdb = Bio::DB::Fasta->new($pepdir, -glob => '*.{pep,fas,fasta,fa}'); mkdir( $cluster_dir ); open(CLUSTERS, $cluster_file) || die "cannot open file $cluster_file: $!"; my @clusters; while(){ my ($cluster,$sp,$gene) = split; push @{ $clusters[$cluster] }, [$sp, $gene]; } my $expected_sp_num= 3; my $i = 0; for my $cluster ( @clusters ) { my %sp_count; for my $geneobj ( @$cluster ) { my ($sp,$gene) = @$geneobj; $sp_count{$sp}++; } my $skip_flag = 0; if ( scalar keys %sp_count == $expected_sp_num ) { for my $count ( values %sp_count ) { if( $count != 1 ) { $skip_flag = 1; } } } else { $skip_flag = 1; } if( $skip_flag ) { $i++; next; } my $out_c = Bio::SeqIO->new(-file => ">$cluster_dir/cluster_$i.cds.fa", -format=> 'fasta'); my $out_p = Bio::SeqIO->new(-file => ">$cluster_dir/cluster_$i.pep.fa", -format => 'fasta'); for my $geneobj ( @$cluster ) { my ($sp,$gene) = @$geneobj; my $cdsseq = $cdsdb->get_Seq_by_id($gene); my $pepseq = $pepdb->get_Seq_by_id($gene); $out_c->write_seq($cdsseq); $out_p->write_seq($pepseq); } $i++; }