#! /usr/bin/perl
use strict;
use warnings;

# https://www.perl-community.de/bat/poard/thread/18288

use File::Find;                             # for searching for files/directories
use File::Spec::Functions qw( catfile );    # for creating portable file paths
use Cwd;                                    # for determining the current working directory
                                            # (not used by the code below)

# Directories that are searched recursively for '.ffn' files.
my @directories = (
    '/dev/shm/data',    # herein are the .ffn files; ADJUST THIS PATH
);

# Name of the file that is written into every matching directory.
my $outfile_name = 'combined.ffn';

# Maps a directory path to a reference to the list of '.ffn' file names
# (without directory part) found directly inside that directory.
my %match;

# Read one input file and extract its data.
#
# Parameters:
#   $path - full path of the input file
#
# Returns a two-element list:
#   - the organism name taken from the first matching header line of the
#     file, or '' if the file contains no matching header line
#   - a reference to the list of extracted sequence fragments
#
# Dies if the file cannot be opened.
sub _read_ffn_file {
    my ($path) = @_;

    my $organism = '';
    my @sequences;

    open my $infh, '<', $path
        or die "open(ro,$path) failed: $!\n";

    while ( my $line = <$infh> ) {

        # skip empty lines
        next if $line =~ m{^\s*$};

        # a header line starts with '>' and carries the organism name
        # in square brackets
        next if $line !~ m/^>.+\[([^]]+)\]/;

        # keep only the first organism name seen
        $organism = $1 if $organism eq '';

        # The line directly after the header holds the beginning of the
        # sequence.  A header on the very last line has no such line.
        my $sequence_line = <$infh>;
        last if !defined $sequence_line;

        # skip the first three characters, then take the next 25 bases
        if ( $sequence_line =~ m/\A...([ATGC]{25})/ ) {
            push @sequences, $1;
        }
    }

    close $infh;

    return ( $organism, \@sequences );
}

# Extract the data of several input files and combine it into one output file.
#
# Parameters:
#   $newfile   - name of the output file (created inside $directory)
#   $directory - directory that contains the input files
#   $filesRef  - reference to an array of input file names, relative to
#                $directory; the array is not modified
#
# The output consists of one header line "> <organism>" followed by all
# extracted sequence fragments, written back to back without separators.
# The organism is taken from the first matching header line over all input
# files, in the order given by $filesRef.
#
# Dies if an input file cannot be opened or the output file cannot be
# opened, written or closed.
sub combine_to {
    my $newfile   = shift;    # output filename
    my $directory = shift;    # files' directory
    my $filesRef  = shift;    # reference to array of filenames

    my $organism = '';        # name of organism
    my @sequences;            # list of extracted gene sequences

    # Read every input before touching the output file, so that a failing
    # input does not leave a truncated output file behind.
    for my $name ( @{$filesRef} ) {

        # Build the path in a separate variable: the loop variable is an
        # alias for the caller's array element and must not be changed.
        my $infile = catfile( $directory, $name );

        my ( $file_organism, $file_sequences ) = _read_ffn_file($infile);

        # store name of organism only if not known yet
        $organism = $file_organism if $organism eq '';

        push @sequences, @{$file_sequences};
    }

    # create full path of output file
    my $outfile = catfile( $directory, $newfile );

    open my $outfh, '>', $outfile
        or die "open(w, $outfile) failed: $!\n";

    # print collected data to output file
    print {$outfh} "> $organism\n", @sequences
        or die "print($outfile) failed: $!\n";

    close $outfh
        or die "close($outfile) failed: $!\n";

    return;
}

# Callback for File::Find::find().
#
# For every directory visited, the '.ffn' files located directly inside it
# are collected (the output file itself is excluded).  If the directory
# contains one or two such files, the sorted list of their names is stored
# in %match under the directory's path.  Directories with no or with more
# than two '.ffn' files are ignored.
#
# The callback works on $File::Find::name and therefore expects find() to
# be called with 'no_chdir => 1'; otherwise relative start directories
# could not be resolved.
#
# Dies if a directory cannot be opened.
sub find_ffn_files {
    my $dir = $File::Find::name;

    # skip if not a directory
    return if !-d $dir;

    opendir my $dh, $dir
        or die "Cannot open '$dir': $!\n";

    # plain files ending in '.ffn', except our own output file
    my @files = grep {
               m/\.ffn\z/
            && $_ ne $outfile_name
            && -f catfile( $dir, $_ )
    } readdir $dh;

    closedir $dh;

    # readdir returns the entries in no particular order; sort them to
    # get the same result on every run
    @files = sort @files;

    # we must have found one or two files
    if ( @files >= 1 && @files <= 2 ) {
        $match{$dir} = \@files;
    }

    return;
}

# search for files and fill %match
find( { wanted => \&find_ffn_files, no_chdir => 1 }, @directories );

# combine the files found into 'combined.ffn' in the corresponding directory
for my $dir ( sort keys %match ) {
    combine_to( $outfile_name, $dir => $match{$dir} );
}