#! /usr/bin/perl
use strict;
use warnings;

# https://www.perl-community.de/bat/poard/thread/18288

use File::Find;                             # for searching for files/directories
use File::Spec::Functions qw( catfile );    # for creating portable file-paths
use Cwd;                                    # for determining the current working directory

# Root directories that are searched (recursively, via File::Find) for
# '.ffn' files.  ADJUST THIS PATH to the location of your data.
my @directories = ( '/dev/shm/data' );

# Name of the combined file that is written into every matching directory;
# a file of this name is never treated as an input file.
my $outfile_name = 'combined.ffn';

# Maps a directory path to a reference to the array of '.ffn' file names
# that were found directly inside that directory.
my %match;


# Extract the organism name and the leading fragment of every gene sequence
# from a set of '.ffn' files and write the combined result to one file.
#
# Arguments (positional):
#   $newfile   - name of the output file; it is created inside $directory
#   $directory - directory that holds the input files and receives the output
#   $filesRef  - reference to an array of input file names, relative to
#                $directory; the array is left untouched
#
# Output format: one header line "> <organism>" (the first organism name
# found in square brackets on a '>' line), followed by all collected
# fragments concatenated without any separator.
#
# Dies if an input or the output file cannot be opened, or if writing to or
# closing the output file fails.
sub combine_to {

    my $newfile   = shift;                  # output filename
    my $directory = shift;                  # files' directory
    my $filesRef  = shift;                  # reference to array of filenames

    # create full path of output file
    my $outfile = catfile( $directory, $newfile );

    my $organism = '';                      # name of organism
    my @sequences;                          # list of extracted gene sequences

    # All inputs are parsed before the output file is opened, so a failing
    # read cannot leave a truncated output file behind.
    for my $infile ( @{ $filesRef } ) {

        # The loop variable aliases the element of the caller's array, so the
        # full path goes into its own variable instead of overwriting it.
        my $inpath = catfile( $directory, $infile );

        open my $infh, '<', $inpath or die "open(ro,$inpath) failed: $!\n";

        while ( my $line = <$infh> ) {

            # skip empty lines
            next    if $line =~ m{^\s*$};

            # header line: '>' followed by text and '[organism name]'
            if ( $line =~ m/^>.+\[([^]]+)\]/ ) {
                # only the first organism name encountered is kept
                $organism = $1  if $organism eq '';

                # the line after the header carries the start of the sequence
                $line = <$infh>;

                # a header on the very last line has no sequence line
                last    if !defined $line;

                # skip the first three characters (presumably the start
                # codon - TODO confirm) and keep the next 25 bases
                if ( $line =~ m/\A...([ATGC]{25})/ ) {
                    push @sequences, $1;
                }
            }
        }

        close $infh;
    }

    open my $outfh, '>', $outfile or die "open(w, $outfile) failed: $!\n";

    print {$outfh} "> $organism\n", @sequences
        or die "print($outfile) failed: $!\n";

    # buffered write errors only surface here, so close must be checked
    close $outfh    or die "close($outfile) failed: $!\n";
}


# File::Find callback ("wanted" routine).  For every directory visited it
# lists the '.ffn' files directly inside (ignoring a file named
# $outfile_name) and, if there are one or two of them, stores a reference to
# that list of bare file names in %match under the directory's path as given
# by $File::Find::name.
#
# Takes no arguments; reads $_ and $File::Find::name as set by File::Find.
# Dies if a visited directory cannot be opened.
sub find_ffn_files {

    # File::Find has chdir()'d into the parent of the current entry (unless
    # no_chdir is in effect), so filesystem access has to go through $_ ;
    # $File::Find::name is only resolvable from here for absolute roots.
    my $entry = $_;

    return if ! -d $entry;                  # skip if not a directory

    my $dir = $File::Find::name;            # path used as key and in messages

    opendir my $dh, $entry   or die "Cannot open '$dir': $!\n";
    # keep the names of '.ffn' files, leaving out the combined output file
    my @files = grep { m/\.ffn$/ && $_ ne $outfile_name } readdir $dh;
    closedir $dh;

    # record the directory only if it holds one or two such files
    $match{$dir} = \@files      if @files && 2 >= @files;
}

# Walk all configured directories; the callback records every directory
# that qualifies in %match.
find( \&find_ffn_files, @directories );

# Write the combined file into each recorded directory.
foreach my $directory ( keys %match ) {
    my $ffn_files = $match{$directory};
    combine_to( $outfile_name, $directory, $ffn_files );
}