#!/usr/bin/perl
# Task: extract the GeneID number and gene information from a GFF file and
# store them in a hash keyed by GeneID, then print every key with its values.
#
# Usage: perl <this script> [input.gff]
#        The input file defaults to "Genomteil.gff" when no argument is given.
#
# Layout of each stored record (an array reference):
#   [0] start position          (column 4 of the gene line)
#   [1] end position            (column 5 of the gene line)
#   [2] strand, '+' or '-'      (column 7 of the gene line)
#   [3] feature type            (column 3 of the CDS/exon line)
#   [4] gbkey value, or 0 when the feature line carries no gbkey
#   [5] product name            (only present when the feature line has one)
use strict;
use warnings;

# parse_gene_records($fh)
#
# Reads tab-separated GFF lines from the open filehandle $fh and returns a
# hash reference mapping GeneID => array reference (layout described above).
#
# A line whose type column matches 'gene' always opens a new record, even if
# the previous gene never saw a closing feature line; this keeps features of
# one gene from being filed under the GeneID of another.  Every following
# line whose type matches 'CDS' or 'exon' appends to the open record and
# stores a copy of it, until a line carrying ';exon_number=1' closes it.
sub parse_gene_records {
    my ($fh) = @_;

    my $no_value = 0;    # placeholder stored when a feature has no gbkey
    my %record_for;      # GeneID => [ fields ]
    my @record;          # fields collected for the gene currently open
    my $gene_id;         # GeneID of the open gene; undef while none is open

    while (my $line = <$fh>) {

        # Strip the line ending, including the "\r" of CRLF files, so that
        # attribute patterns do not depend on what follows the last field.
        $line =~ s/\r?\n\z//;

        # Skip comment/directive lines and anything without nine columns.
        next if $line =~ /\A#/;
        my @fields = split /\t/, $line, -1;
        next if @fields < 9;

        my ($type, $start, $end, $strand, $attributes) = @fields[2, 3, 4, 6, 8];

        # 1) A gene line starts a fresh record and supplies the hash key.
        if ($type =~ /gene/) {
            if ($attributes =~ /(?:\A|;)db_xref=GeneID:(\d+)/) {
                $gene_id = $1;
                @record  = ($start, $end, $strand);
            }
            else {
                # Without a GeneID there is no key to store the record under.
                warn "line $.: gene feature without db_xref=GeneID, skipped\n";
                $gene_id = undef;
            }
            next;
        }

        # 2) Only CDS/exon lines that belong to an open gene are of interest.
        next unless defined $gene_id;
        next unless $type =~ /CDS|exon/;

        push @record, $type;

        # 3) gbkey is captured as exactly four word characters (e.g. rRNA,
        #    tRNA); lines without it get the placeholder instead.
        push @record, ($attributes =~ /gbkey=(\w{4})/ ? $1 : $no_value);

        # 4) The product name ends at the next ';protein_' or ';db_'
        #    attribute.  The match is non-greedy so that several db_xref
        #    attributes on one line do not end up inside the product name.
        if ($attributes =~ /product=(.*?);(?:protein_|db_)/) {
            push @record, $1;
        }

        # 5) Store a copy, so later pushes cannot alter the stored record.
        $record_for{$gene_id} = [@record];

        # 6) ';exon_number=1' closes the gene; the look-ahead keeps
        #    exon_number=10, 11, ... from being mistaken for it.
        if ($attributes =~ /;exon_number=1(?!\d)/) {
            $gene_id = undef;
        }
    }

    return \%record_for;
}

# print_gene_records($record_for)
#
# Prints one "GeneID --> value" line per stored value.  Keys are sorted
# numerically so that repeated runs produce the same output order.
sub print_gene_records {
    my ($record_for) = @_;

    for my $gene_id (sort { $a <=> $b } keys %{$record_for}) {
        for my $value (@{ $record_for->{$gene_id} }) {
            print "$gene_id --> $value\n";
        }
    }

    return;
}

# main(@args)
#
# Opens the input file (first argument, default "Genomteil.gff"), parses it
# and prints the result.  Dies with the file name in the message when the
# file cannot be opened or closed.
sub main {
    my @args = @_;
    my $path = @args ? $args[0] : 'Genomteil.gff';

    open my $in, '<', $path or die "Cannot open '$path': $!\n";
    my $record_for = parse_gene_records($in);
    close $in or die "Cannot close '$path': $!\n";

    print_gene_records($record_for);

    return;
}

# Run only when executed as a script, so the subs above can be loaded and
# exercised without touching the input file.
main(@ARGV) unless caller;

1;